2019-06-03 13:44:50 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2012-03-05 19:49:27 +08:00
|
|
|
/*
|
|
|
|
* Based on arch/arm/mm/mmu.c
|
|
|
|
*
|
|
|
|
* Copyright (C) 1995-2005 Russell King
|
|
|
|
* Copyright (C) 2012 ARM Ltd.
|
|
|
|
*/
|
|
|
|
|
2016-08-15 14:45:46 +08:00
|
|
|
#include <linux/cache.h>
|
2012-03-05 19:49:27 +08:00
|
|
|
#include <linux/export.h>
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/init.h>
|
2017-04-03 10:24:34 +08:00
|
|
|
#include <linux/ioport.h>
|
|
|
|
#include <linux/kexec.h>
|
2015-06-01 19:40:32 +08:00
|
|
|
#include <linux/libfdt.h>
|
2012-03-05 19:49:27 +08:00
|
|
|
#include <linux/mman.h>
|
|
|
|
#include <linux/nodemask.h>
|
|
|
|
#include <linux/memblock.h>
|
2020-03-04 12:28:43 +08:00
|
|
|
#include <linux/memory.h>
|
2012-03-05 19:49:27 +08:00
|
|
|
#include <linux/fs.h>
|
2012-10-23 21:55:08 +08:00
|
|
|
#include <linux/io.h>
|
2017-01-11 05:35:49 +08:00
|
|
|
#include <linux/mm.h>
|
2017-05-15 19:40:20 +08:00
|
|
|
#include <linux/vmalloc.h>
|
2012-03-05 19:49:27 +08:00
|
|
|
|
2016-01-25 19:44:56 +08:00
|
|
|
#include <asm/barrier.h>
|
2012-03-05 19:49:27 +08:00
|
|
|
#include <asm/cputype.h>
|
2014-11-22 05:50:42 +08:00
|
|
|
#include <asm/fixmap.h>
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
#include <asm/kasan.h>
|
2015-10-19 21:19:28 +08:00
|
|
|
#include <asm/kernel-pgtable.h>
|
2012-03-05 19:49:27 +08:00
|
|
|
#include <asm/sections.h>
|
|
|
|
#include <asm/setup.h>
|
2019-05-15 06:46:51 +08:00
|
|
|
#include <linux/sizes.h>
|
2012-03-05 19:49:27 +08:00
|
|
|
#include <asm/tlb.h>
|
|
|
|
#include <asm/mmu_context.h>
|
2016-10-28 00:27:34 +08:00
|
|
|
#include <asm/ptdump.h>
|
2018-06-06 15:01:21 +08:00
|
|
|
#include <asm/tlbflush.h>
|
2012-03-05 19:49:27 +08:00
|
|
|
|
2017-03-10 04:52:07 +08:00
|
|
|
#define NO_BLOCK_MAPPINGS BIT(0)
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
#define NO_CONT_MAPPINGS BIT(1)
|
2017-03-10 04:52:07 +08:00
|
|
|
|
arm64: mm: increase VA range of identity map
The page size and the number of translation levels, and hence the supported
virtual address range, are build-time configurables on arm64 whose optimal
values are use case dependent. However, in the current implementation, if
the system's RAM is located at a very high offset, the virtual address range
needs to reflect that merely because the identity mapping, which is only used
to enable or disable the MMU, requires the extended virtual range to map the
physical memory at an equal virtual offset.
This patch relaxes that requirement, by increasing the number of translation
levels for the identity mapping only, and only when actually needed, i.e.,
when system RAM's offset is found to be out of reach at runtime.
Tested-by: Laura Abbott <lauraa@codeaurora.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Tested-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
2015-03-20 00:42:27 +08:00
|
|
|
u64 idmap_t0sz = TCR_T0SZ(VA_BITS);
|
arm64: allow ID map to be extended to 52 bits
Currently, when using VA_BITS < 48, if the ID map text happens to be
placed in physical memory above VA_BITS, we increase the VA size (up to
48) and create a new table level, in order to map in the ID map text.
This is okay because the system always supports 48 bits of VA.
This patch extends the code such that if the system supports 52 bits of
VA, and the ID map text is placed that high up, then we increase the VA
size accordingly, up to 52.
One difference from the current implementation is that so far the
condition of VA_BITS < 48 has meant that the top level table is always
"full", with the maximum number of entries, and an extra table level is
always needed. Now, when VA_BITS = 48 (and using 64k pages), the top
level table is not full, and we simply need to increase the number of
entries in it, instead of creating a new table level.
Tested-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Tested-by: Bob Picco <bob.picco@oracle.com>
Reviewed-by: Bob Picco <bob.picco@oracle.com>
Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
[catalin.marinas@arm.com: reduce arguments to __create_hyp_mappings()]
[catalin.marinas@arm.com: reworked/renamed __cpu_uses_extended_idmap_level()]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-12-14 01:07:24 +08:00
|
|
|
u64 idmap_ptrs_per_pgd = PTRS_PER_PGD;
|
arm64: mm: increase VA range of identity map
The page size and the number of translation levels, and hence the supported
virtual address range, are build-time configurables on arm64 whose optimal
values are use case dependent. However, in the current implementation, if
the system's RAM is located at a very high offset, the virtual address range
needs to reflect that merely because the identity mapping, which is only used
to enable or disable the MMU, requires the extended virtual range to map the
physical memory at an equal virtual offset.
This patch relaxes that requirement, by increasing the number of translation
levels for the identity mapping only, and only when actually needed, i.e.,
when system RAM's offset is found to be out of reach at runtime.
Tested-by: Laura Abbott <lauraa@codeaurora.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Tested-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
2015-03-20 00:42:27 +08:00
|
|
|
|
2019-08-07 23:55:18 +08:00
|
|
|
u64 __section(".mmuoff.data.write") vabits_actual;
|
|
|
|
EXPORT_SYMBOL(vabits_actual);
|
arm64: mm: increase VA range of identity map
The page size and the number of translation levels, and hence the supported
virtual address range, are build-time configurables on arm64 whose optimal
values are use case dependent. However, in the current implementation, if
the system's RAM is located at a very high offset, the virtual address range
needs to reflect that merely because the identity mapping, which is only used
to enable or disable the MMU, requires the extended virtual range to map the
physical memory at an equal virtual offset.
This patch relaxes that requirement, by increasing the number of translation
levels for the identity mapping only, and only when actually needed, i.e.,
when system RAM's offset is found to be out of reach at runtime.
Tested-by: Laura Abbott <lauraa@codeaurora.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Tested-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
2015-03-20 00:42:27 +08:00
|
|
|
|
2016-08-15 14:45:46 +08:00
|
|
|
u64 kimage_voffset __ro_after_init;
|
2016-02-16 20:52:42 +08:00
|
|
|
EXPORT_SYMBOL(kimage_voffset);
|
|
|
|
|
2012-03-05 19:49:27 +08:00
|
|
|
/*
|
|
|
|
* Empty_zero_page is a special page that is used for zero-initialized data
|
|
|
|
* and COW.
|
|
|
|
*/
|
arm64: mm: place empty_zero_page in bss
Currently the zero page is set up in paging_init, and thus we cannot use
the zero page earlier. We use the zero page as a reserved TTBR value
from which no TLB entries may be allocated (e.g. when uninstalling the
idmap). To enable such usage earlier (as may be required for invasive
changes to the kernel page tables), and to minimise the time that the
idmap is active, we need to be able to use the zero page before
paging_init.
This patch follows the example set by x86, by allocating the zero page
at compile time, in .bss. This means that the zero page itself is
available immediately upon entry to start_kernel (as we zero .bss before
this), and also means that the zero page takes up no space in the raw
Image binary. The associated struct page is allocated in bootmem_init,
and remains unavailable until this time.
Outside of arch code, the only users of empty_zero_page assume that the
empty_zero_page symbol refers to the zeroed memory itself, and that
ZERO_PAGE(x) must be used to acquire the associated struct page,
following the example of x86. This patch also brings arm64 inline with
these assumptions.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:44:57 +08:00
|
|
|
unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss;
|
2012-03-05 19:49:27 +08:00
|
|
|
EXPORT_SYMBOL(empty_zero_page);
|
|
|
|
|
2016-02-16 20:52:40 +08:00
|
|
|
static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss;
|
|
|
|
static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused;
|
|
|
|
static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;
|
|
|
|
|
2018-09-25 00:15:02 +08:00
|
|
|
static DEFINE_SPINLOCK(swapper_pgdir_lock);
|
|
|
|
|
|
|
|
void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
|
|
|
|
{
|
|
|
|
pgd_t *fixmap_pgdp;
|
|
|
|
|
|
|
|
spin_lock(&swapper_pgdir_lock);
|
2018-10-10 22:43:22 +08:00
|
|
|
fixmap_pgdp = pgd_set_fixmap(__pa_symbol(pgdp));
|
2018-09-25 00:15:02 +08:00
|
|
|
WRITE_ONCE(*fixmap_pgdp, pgd);
|
|
|
|
/*
|
|
|
|
* We need dsb(ishst) here to ensure the page-table-walker sees
|
|
|
|
* our new entry before set_p?d() returns. The fixmap's
|
|
|
|
* flush_tlb_kernel_range() via clear_fixmap() does this for us.
|
|
|
|
*/
|
|
|
|
pgd_clear_fixmap();
|
|
|
|
spin_unlock(&swapper_pgdir_lock);
|
|
|
|
}
|
|
|
|
|
2012-03-05 19:49:27 +08:00
|
|
|
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
|
|
|
|
unsigned long size, pgprot_t vma_prot)
|
|
|
|
{
|
|
|
|
if (!pfn_valid(pfn))
|
|
|
|
return pgprot_noncached(vma_prot);
|
|
|
|
else if (file->f_flags & O_SYNC)
|
|
|
|
return pgprot_writecombine(vma_prot);
|
|
|
|
return vma_prot;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(phys_mem_access_prot);
|
|
|
|
|
2019-03-12 08:57:46 +08:00
|
|
|
static phys_addr_t __init early_pgtable_alloc(int shift)
|
2012-03-05 19:49:27 +08:00
|
|
|
{
|
2015-11-21 01:45:40 +08:00
|
|
|
phys_addr_t phys;
|
|
|
|
void *ptr;
|
|
|
|
|
memblock: rename memblock_alloc{_nid,_try_nid} to memblock_phys_alloc*
Make it explicit that the caller gets a physical address rather than a
virtual one.
This will also allow using meblock_alloc prefix for memblock allocations
returning virtual address, which is done in the following patches.
The conversion is done using the following semantic patch:
@@
expression e1, e2, e3;
@@
(
- memblock_alloc(e1, e2)
+ memblock_phys_alloc(e1, e2)
|
- memblock_alloc_nid(e1, e2, e3)
+ memblock_phys_alloc_nid(e1, e2, e3)
|
- memblock_alloc_try_nid(e1, e2, e3)
+ memblock_phys_alloc_try_nid(e1, e2, e3)
)
Link: http://lkml.kernel.org/r/1536927045-23536-7-git-send-email-rppt@linux.vnet.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Greentime Hu <green.hu@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guan Xuetao <gxt@pku.edu.cn>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "James E.J. Bottomley" <jejb@parisc-linux.org>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Ley Foon Tan <lftan@altera.com>
Cc: Mark Salter <msalter@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Palmer Dabbelt <palmer@sifive.com>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Rich Felker <dalias@libc.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Serge Semin <fancer.lancer@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-10-31 06:07:59 +08:00
|
|
|
phys = memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
|
2019-03-12 14:29:26 +08:00
|
|
|
if (!phys)
|
|
|
|
panic("Failed to allocate page table page\n");
|
2016-01-25 19:45:08 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE
|
|
|
|
* slot will be free, so we can (ab)use the FIX_PTE slot to initialise
|
|
|
|
* any level of table.
|
|
|
|
*/
|
|
|
|
ptr = pte_set_fixmap(phys);
|
|
|
|
|
2016-01-25 19:44:56 +08:00
|
|
|
memset(ptr, 0, PAGE_SIZE);
|
|
|
|
|
2016-01-25 19:45:08 +08:00
|
|
|
/*
|
|
|
|
* Implicit barriers also ensure the zeroed page is visible to the page
|
|
|
|
* table walker
|
|
|
|
*/
|
|
|
|
pte_clear_fixmap();
|
|
|
|
|
|
|
|
return phys;
|
2012-03-05 19:49:27 +08:00
|
|
|
}
|
|
|
|
|
2016-10-21 19:22:56 +08:00
|
|
|
static bool pgattr_change_is_safe(u64 old, u64 new)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* The following mapping attributes may be updated in live
|
|
|
|
* kernel mappings without the need for break-before-make.
|
|
|
|
*/
|
2018-02-24 02:04:48 +08:00
|
|
|
static const pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG;
|
2016-10-21 19:22:56 +08:00
|
|
|
|
2017-03-10 04:52:06 +08:00
|
|
|
/* creating or taking down mappings is always safe */
|
|
|
|
if (old == 0 || new == 0)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
/* live contiguous mappings may not be manipulated at all */
|
|
|
|
if ((old | new) & PTE_CONT)
|
|
|
|
return false;
|
|
|
|
|
2018-02-24 02:04:48 +08:00
|
|
|
/* Transitioning from Non-Global to Global is unsafe */
|
|
|
|
if (old & ~new & PTE_NG)
|
|
|
|
return false;
|
2018-01-29 19:59:54 +08:00
|
|
|
|
2017-03-10 04:52:06 +08:00
|
|
|
return ((old ^ new) & ~mask) == 0;
|
2016-10-21 19:22:56 +08:00
|
|
|
}
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
phys_addr_t phys, pgprot_t prot)
|
2012-03-05 19:49:27 +08:00
|
|
|
{
|
2018-02-15 19:14:56 +08:00
|
|
|
pte_t *ptep;
|
2012-03-05 19:49:27 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
ptep = pte_set_fixmap_offset(pmdp, addr);
|
2012-03-05 19:49:27 +08:00
|
|
|
do {
|
2018-02-15 19:14:56 +08:00
|
|
|
pte_t old_pte = READ_ONCE(*ptep);
|
2016-10-21 19:22:56 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot));
|
2016-10-21 19:22:56 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* After the PTE entry has been populated once, we
|
|
|
|
* only allow updates to the permission attributes.
|
|
|
|
*/
|
2018-02-15 19:14:56 +08:00
|
|
|
BUG_ON(!pgattr_change_is_safe(pte_val(old_pte),
|
|
|
|
READ_ONCE(pte_val(*ptep))));
|
2016-10-21 19:22:56 +08:00
|
|
|
|
2017-03-10 04:52:04 +08:00
|
|
|
phys += PAGE_SIZE;
|
2018-02-15 19:14:56 +08:00
|
|
|
} while (ptep++, addr += PAGE_SIZE, addr != end);
|
2016-01-25 19:45:08 +08:00
|
|
|
|
|
|
|
pte_clear_fixmap();
|
2012-03-05 19:49:27 +08:00
|
|
|
}
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
unsigned long end, phys_addr_t phys,
|
|
|
|
pgprot_t prot,
|
2019-03-12 08:57:46 +08:00
|
|
|
phys_addr_t (*pgtable_alloc)(int),
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
int flags)
|
2012-03-05 19:49:27 +08:00
|
|
|
{
|
|
|
|
unsigned long next;
|
2018-02-15 19:14:56 +08:00
|
|
|
pmd_t pmd = READ_ONCE(*pmdp);
|
2012-03-05 19:49:27 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
BUG_ON(pmd_sect(pmd));
|
|
|
|
if (pmd_none(pmd)) {
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
phys_addr_t pte_phys;
|
2016-02-06 08:24:46 +08:00
|
|
|
BUG_ON(!pgtable_alloc);
|
2019-03-12 08:57:46 +08:00
|
|
|
pte_phys = pgtable_alloc(PAGE_SHIFT);
|
2018-02-15 19:14:56 +08:00
|
|
|
__pmd_populate(pmdp, pte_phys, PMD_TYPE_TABLE);
|
|
|
|
pmd = READ_ONCE(*pmdp);
|
2012-03-05 19:49:27 +08:00
|
|
|
}
|
2018-02-15 19:14:56 +08:00
|
|
|
BUG_ON(pmd_bad(pmd));
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
|
|
|
|
do {
|
|
|
|
pgprot_t __prot = prot;
|
|
|
|
|
|
|
|
next = pte_cont_addr_end(addr, end);
|
|
|
|
|
|
|
|
/* use a contiguous mapping if the range is suitably aligned */
|
|
|
|
if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
|
|
|
|
(flags & NO_CONT_MAPPINGS) == 0)
|
|
|
|
__prot = __pgprot(pgprot_val(prot) | PTE_CONT);
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
init_pte(pmdp, addr, next, phys, __prot);
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
|
|
|
|
phys += next - addr;
|
|
|
|
} while (addr = next, addr != end);
|
|
|
|
}
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end,
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
phys_addr_t phys, pgprot_t prot,
|
2019-03-12 08:57:46 +08:00
|
|
|
phys_addr_t (*pgtable_alloc)(int), int flags)
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
{
|
|
|
|
unsigned long next;
|
2018-02-15 19:14:56 +08:00
|
|
|
pmd_t *pmdp;
|
2012-03-05 19:49:27 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
pmdp = pmd_set_fixmap_offset(pudp, addr);
|
2012-03-05 19:49:27 +08:00
|
|
|
do {
|
2018-02-15 19:14:56 +08:00
|
|
|
pmd_t old_pmd = READ_ONCE(*pmdp);
|
2016-10-21 19:22:56 +08:00
|
|
|
|
2012-03-05 19:49:27 +08:00
|
|
|
next = pmd_addr_end(addr, end);
|
2016-10-21 19:22:56 +08:00
|
|
|
|
2012-03-05 19:49:27 +08:00
|
|
|
/* try section mapping first */
|
2016-02-06 08:24:47 +08:00
|
|
|
if (((addr | next | phys) & ~SECTION_MASK) == 0 &&
|
2017-03-10 04:52:07 +08:00
|
|
|
(flags & NO_BLOCK_MAPPINGS) == 0) {
|
2018-02-15 19:14:56 +08:00
|
|
|
pmd_set_huge(pmdp, phys, prot);
|
2016-10-21 19:22:56 +08:00
|
|
|
|
2014-02-05 00:01:31 +08:00
|
|
|
/*
|
2016-10-21 19:22:56 +08:00
|
|
|
* After the PMD entry has been populated once, we
|
|
|
|
* only allow updates to the permission attributes.
|
2014-02-05 00:01:31 +08:00
|
|
|
*/
|
2016-10-21 19:22:56 +08:00
|
|
|
BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
|
2018-02-15 19:14:56 +08:00
|
|
|
READ_ONCE(pmd_val(*pmdp))));
|
2014-02-05 00:01:31 +08:00
|
|
|
} else {
|
2018-02-15 19:14:56 +08:00
|
|
|
alloc_init_cont_pte(pmdp, addr, next, phys, prot,
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
pgtable_alloc, flags);
|
2016-10-21 19:22:56 +08:00
|
|
|
|
|
|
|
BUG_ON(pmd_val(old_pmd) != 0 &&
|
2018-02-15 19:14:56 +08:00
|
|
|
pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
|
2014-02-05 00:01:31 +08:00
|
|
|
}
|
2012-03-05 19:49:27 +08:00
|
|
|
phys += next - addr;
|
2018-02-15 19:14:56 +08:00
|
|
|
} while (pmdp++, addr = next, addr != end);
|
2016-01-25 19:45:08 +08:00
|
|
|
|
|
|
|
pmd_clear_fixmap();
|
2012-03-05 19:49:27 +08:00
|
|
|
}
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
unsigned long end, phys_addr_t phys,
|
|
|
|
pgprot_t prot,
|
2019-03-12 08:57:46 +08:00
|
|
|
phys_addr_t (*pgtable_alloc)(int), int flags)
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
{
|
|
|
|
unsigned long next;
|
2018-02-15 19:14:56 +08:00
|
|
|
pud_t pud = READ_ONCE(*pudp);
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Check for initial section mappings in the pgd/pud.
|
|
|
|
*/
|
2018-02-15 19:14:56 +08:00
|
|
|
BUG_ON(pud_sect(pud));
|
|
|
|
if (pud_none(pud)) {
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
phys_addr_t pmd_phys;
|
|
|
|
BUG_ON(!pgtable_alloc);
|
2019-03-12 08:57:46 +08:00
|
|
|
pmd_phys = pgtable_alloc(PMD_SHIFT);
|
2018-02-15 19:14:56 +08:00
|
|
|
__pud_populate(pudp, pmd_phys, PUD_TYPE_TABLE);
|
|
|
|
pud = READ_ONCE(*pudp);
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
}
|
2018-02-15 19:14:56 +08:00
|
|
|
BUG_ON(pud_bad(pud));
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
|
|
|
|
do {
|
|
|
|
pgprot_t __prot = prot;
|
|
|
|
|
|
|
|
next = pmd_cont_addr_end(addr, end);
|
|
|
|
|
|
|
|
/* use a contiguous mapping if the range is suitably aligned */
|
|
|
|
if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) &&
|
|
|
|
(flags & NO_CONT_MAPPINGS) == 0)
|
|
|
|
__prot = __pgprot(pgprot_val(prot) | PTE_CONT);
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
init_pmd(pudp, addr, next, phys, __prot, pgtable_alloc, flags);
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
|
|
|
|
phys += next - addr;
|
|
|
|
} while (addr = next, addr != end);
|
|
|
|
}
|
|
|
|
|
2015-01-22 09:36:06 +08:00
|
|
|
static inline bool use_1G_block(unsigned long addr, unsigned long next,
|
|
|
|
unsigned long phys)
|
|
|
|
{
|
|
|
|
if (PAGE_SHIFT != 12)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (((addr | next | phys) & ~PUD_MASK) != 0)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
|
|
|
|
phys_addr_t phys, pgprot_t prot,
|
2019-03-12 08:57:46 +08:00
|
|
|
phys_addr_t (*pgtable_alloc)(int),
|
2018-02-15 19:14:56 +08:00
|
|
|
int flags)
|
2012-03-05 19:49:27 +08:00
|
|
|
{
|
|
|
|
unsigned long next;
|
2018-02-15 19:14:56 +08:00
|
|
|
pud_t *pudp;
|
|
|
|
pgd_t pgd = READ_ONCE(*pgdp);
|
2012-03-05 19:49:27 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
if (pgd_none(pgd)) {
|
2016-02-06 08:24:46 +08:00
|
|
|
phys_addr_t pud_phys;
|
|
|
|
BUG_ON(!pgtable_alloc);
|
2019-03-12 08:57:46 +08:00
|
|
|
pud_phys = pgtable_alloc(PUD_SHIFT);
|
2018-02-15 19:14:56 +08:00
|
|
|
__pgd_populate(pgdp, pud_phys, PUD_TYPE_TABLE);
|
|
|
|
pgd = READ_ONCE(*pgdp);
|
2014-05-12 17:40:51 +08:00
|
|
|
}
|
2018-02-15 19:14:56 +08:00
|
|
|
BUG_ON(pgd_bad(pgd));
|
2014-05-12 17:40:51 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
pudp = pud_set_fixmap_offset(pgdp, addr);
|
2012-03-05 19:49:27 +08:00
|
|
|
do {
|
2018-02-15 19:14:56 +08:00
|
|
|
pud_t old_pud = READ_ONCE(*pudp);
|
2016-10-21 19:22:56 +08:00
|
|
|
|
2012-03-05 19:49:27 +08:00
|
|
|
next = pud_addr_end(addr, end);
|
2014-05-06 21:02:27 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* For 4K granule only, attempt to put down a 1GB block
|
|
|
|
*/
|
2017-03-10 04:52:07 +08:00
|
|
|
if (use_1G_block(addr, next, phys) &&
|
|
|
|
(flags & NO_BLOCK_MAPPINGS) == 0) {
|
2018-02-15 19:14:56 +08:00
|
|
|
pud_set_huge(pudp, phys, prot);
|
2014-05-06 21:02:27 +08:00
|
|
|
|
|
|
|
/*
|
2016-10-21 19:22:56 +08:00
|
|
|
* After the PUD entry has been populated once, we
|
|
|
|
* only allow updates to the permission attributes.
|
2014-05-06 21:02:27 +08:00
|
|
|
*/
|
2016-10-21 19:22:56 +08:00
|
|
|
BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
|
2018-02-15 19:14:56 +08:00
|
|
|
READ_ONCE(pud_val(*pudp))));
|
2014-05-06 21:02:27 +08:00
|
|
|
} else {
|
2018-02-15 19:14:56 +08:00
|
|
|
alloc_init_cont_pmd(pudp, addr, next, phys, prot,
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
pgtable_alloc, flags);
|
2016-10-21 19:22:56 +08:00
|
|
|
|
|
|
|
BUG_ON(pud_val(old_pud) != 0 &&
|
2018-02-15 19:14:56 +08:00
|
|
|
pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
|
2014-05-06 21:02:27 +08:00
|
|
|
}
|
2012-03-05 19:49:27 +08:00
|
|
|
phys += next - addr;
|
2018-02-15 19:14:56 +08:00
|
|
|
} while (pudp++, addr = next, addr != end);
|
2016-01-25 19:45:08 +08:00
|
|
|
|
|
|
|
pud_clear_fixmap();
|
2012-03-05 19:49:27 +08:00
|
|
|
}
|
|
|
|
|
2016-06-29 20:51:30 +08:00
|
|
|
static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
|
|
|
|
unsigned long virt, phys_addr_t size,
|
|
|
|
pgprot_t prot,
|
2019-03-12 08:57:46 +08:00
|
|
|
phys_addr_t (*pgtable_alloc)(int),
|
2017-03-10 04:52:07 +08:00
|
|
|
int flags)
|
2012-03-05 19:49:27 +08:00
|
|
|
{
|
2019-11-03 20:35:58 +08:00
|
|
|
unsigned long addr, end, next;
|
2018-02-15 19:14:56 +08:00
|
|
|
pgd_t *pgdp = pgd_offset_raw(pgdir, virt);
|
2012-03-05 19:49:27 +08:00
|
|
|
|
2015-11-23 21:26:19 +08:00
|
|
|
/*
|
|
|
|
* If the virtual and physical address don't have the same offset
|
|
|
|
* within a page, we cannot map the region as the caller expects.
|
|
|
|
*/
|
|
|
|
if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
|
|
|
|
return;
|
|
|
|
|
2015-11-23 21:26:20 +08:00
|
|
|
phys &= PAGE_MASK;
|
2012-03-05 19:49:27 +08:00
|
|
|
addr = virt & PAGE_MASK;
|
2019-11-03 20:35:58 +08:00
|
|
|
end = PAGE_ALIGN(virt + size);
|
2012-03-05 19:49:27 +08:00
|
|
|
|
|
|
|
do {
|
|
|
|
next = pgd_addr_end(addr, end);
|
2018-02-15 19:14:56 +08:00
|
|
|
alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc,
|
2017-03-10 04:52:07 +08:00
|
|
|
flags);
|
2012-03-05 19:49:27 +08:00
|
|
|
phys += next - addr;
|
2018-02-15 19:14:56 +08:00
|
|
|
} while (pgdp++, addr = next, addr != end);
|
2012-03-05 19:49:27 +08:00
|
|
|
}
|
|
|
|
|
2019-04-08 18:23:48 +08:00
|
|
|
static phys_addr_t __pgd_pgtable_alloc(int shift)
|
2019-03-12 08:57:47 +08:00
|
|
|
{
|
2019-07-12 11:58:02 +08:00
|
|
|
void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL);
|
2019-03-12 08:57:47 +08:00
|
|
|
BUG_ON(!ptr);
|
|
|
|
|
|
|
|
/* Ensure the zeroed page is visible to the page table walker */
|
|
|
|
dsb(ishst);
|
|
|
|
return __pa(ptr);
|
|
|
|
}
|
|
|
|
|
2019-03-12 08:57:46 +08:00
|
|
|
static phys_addr_t pgd_pgtable_alloc(int shift)
|
2015-01-22 09:36:06 +08:00
|
|
|
{
|
2019-04-08 18:23:48 +08:00
|
|
|
phys_addr_t pa = __pgd_pgtable_alloc(shift);
|
2019-03-12 08:57:46 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Call proper page table ctor in case later we need to
|
|
|
|
* call core mm functions like apply_to_page_range() on
|
|
|
|
* this pre-allocated page table.
|
|
|
|
*
|
|
|
|
* We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is
|
|
|
|
* folded, and if so pgtable_pmd_page_ctor() becomes nop.
|
|
|
|
*/
|
|
|
|
if (shift == PAGE_SHIFT)
|
2019-09-26 07:49:46 +08:00
|
|
|
BUG_ON(!pgtable_pte_page_ctor(phys_to_page(pa)));
|
2019-03-12 08:57:46 +08:00
|
|
|
else if (shift == PMD_SHIFT)
|
2019-04-08 18:23:48 +08:00
|
|
|
BUG_ON(!pgtable_pmd_page_ctor(phys_to_page(pa)));
|
2016-01-25 19:44:56 +08:00
|
|
|
|
2019-04-08 18:23:48 +08:00
|
|
|
return pa;
|
2015-01-22 09:36:06 +08:00
|
|
|
}
|
|
|
|
|
2016-02-06 08:24:46 +08:00
|
|
|
/*
|
|
|
|
* This function can only be used to modify existing table entries,
|
|
|
|
* without allocating new levels of table. Note that this permits the
|
|
|
|
* creation of new section or page entries.
|
|
|
|
*/
|
|
|
|
static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
|
2015-01-22 09:36:06 +08:00
|
|
|
phys_addr_t size, pgprot_t prot)
|
2014-03-13 00:28:06 +08:00
|
|
|
{
|
2019-08-14 21:28:48 +08:00
|
|
|
if ((virt >= PAGE_END) && (virt < VMALLOC_START)) {
|
2014-03-13 00:28:06 +08:00
|
|
|
pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n",
|
|
|
|
&phys, virt);
|
|
|
|
return;
|
|
|
|
}
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
|
|
|
|
NO_CONT_MAPPINGS);
|
2014-03-13 00:28:06 +08:00
|
|
|
}
|
|
|
|
|
2014-10-20 21:42:07 +08:00
|
|
|
void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
|
|
|
|
unsigned long virt, phys_addr_t size,
|
2016-10-21 19:22:57 +08:00
|
|
|
pgprot_t prot, bool page_mappings_only)
|
2014-10-20 21:42:07 +08:00
|
|
|
{
|
2017-03-10 04:52:07 +08:00
|
|
|
int flags = 0;
|
|
|
|
|
2016-07-23 01:32:25 +08:00
|
|
|
BUG_ON(mm == &init_mm);
|
|
|
|
|
2017-03-10 04:52:07 +08:00
|
|
|
if (page_mappings_only)
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
|
2017-03-10 04:52:07 +08:00
|
|
|
|
2016-01-25 19:45:10 +08:00
|
|
|
__create_pgd_mapping(mm->pgd, phys, virt, size, prot,
|
2017-03-10 04:52:07 +08:00
|
|
|
pgd_pgtable_alloc, flags);
|
2014-03-13 00:28:06 +08:00
|
|
|
}
|
|
|
|
|
2017-03-10 04:52:00 +08:00
|
|
|
static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
|
|
|
|
phys_addr_t size, pgprot_t prot)
|
2015-01-22 09:36:06 +08:00
|
|
|
{
|
2019-08-14 21:28:48 +08:00
|
|
|
if ((virt >= PAGE_END) && (virt < VMALLOC_START)) {
|
2017-03-10 04:52:00 +08:00
|
|
|
pr_warn("BUG: not updating mapping for %pa at 0x%016lx - outside kernel range\n",
|
2015-01-22 09:36:06 +08:00
|
|
|
&phys, virt);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
|
|
|
|
NO_CONT_MAPPINGS);
|
2017-03-10 04:52:00 +08:00
|
|
|
|
|
|
|
/* flush the TLBs after updating live kernel mappings */
|
|
|
|
flush_tlb_kernel_range(virt, virt + size);
|
2015-01-22 09:36:06 +08:00
|
|
|
}
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
|
2017-04-03 10:24:34 +08:00
|
|
|
phys_addr_t end, pgprot_t prot, int flags)
|
|
|
|
{
|
2018-02-15 19:14:56 +08:00
|
|
|
__create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start,
|
2017-04-03 10:24:34 +08:00
|
|
|
prot, early_pgtable_alloc, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
void __init mark_linear_text_alias_ro(void)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Remove the write permissions from the linear alias of .text/.rodata
|
|
|
|
*/
|
|
|
|
update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text),
|
|
|
|
(unsigned long)__init_begin - (unsigned long)_text,
|
|
|
|
PAGE_KERNEL_RO);
|
|
|
|
}
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
static void __init map_mem(pgd_t *pgdp)
|
2015-01-22 09:36:06 +08:00
|
|
|
{
|
2017-01-13 13:59:35 +08:00
|
|
|
phys_addr_t kernel_start = __pa_symbol(_text);
|
|
|
|
phys_addr_t kernel_end = __pa_symbol(__init_begin);
|
2017-04-03 10:24:34 +08:00
|
|
|
struct memblock_region *reg;
|
2017-03-10 04:52:07 +08:00
|
|
|
int flags = 0;
|
|
|
|
|
arm64: mm: apply r/o permissions of VM areas to its linear alias as well
On arm64, we use block mappings and contiguous hints to map the linear
region, to minimize the TLB footprint. However, this means that the
entire region is mapped using read/write permissions, which we cannot
modify at page granularity without having to take intrusive measures to
prevent TLB conflicts.
This means the linear aliases of pages belonging to read-only mappings
(executable or otherwise) in the vmalloc region are also mapped read/write,
and could potentially be abused to modify things like module code, bpf JIT
code or other read-only data.
So let's fix this, by extending the set_memory_ro/rw routines to take
the linear alias into account. The consequence of enabling this is
that we can no longer use block mappings or contiguous hints, so in
cases where the TLB footprint of the linear region is a bottleneck,
performance may be affected.
Therefore, allow this feature to be runtime en/disabled, by setting
rodata=full (or 'on' to disable just this enhancement, or 'off' to
disable read-only mappings for code and r/o data entirely) on the
kernel command line. Also, allow the default value to be set via a
Kconfig option.
Tested-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
2018-11-07 18:36:20 +08:00
|
|
|
if (rodata_full || debug_pagealloc_enabled())
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
|
2015-01-22 09:36:06 +08:00
|
|
|
/*
|
2016-02-16 20:52:40 +08:00
|
|
|
* Take care not to create a writable alias for the
|
|
|
|
* read-only text and rodata sections of the kernel image.
|
2017-04-03 10:24:34 +08:00
|
|
|
* So temporarily mark them as NOMAP to skip mappings in
|
|
|
|
* the following for-loop
|
2015-01-22 09:36:06 +08:00
|
|
|
*/
|
2017-04-03 10:24:34 +08:00
|
|
|
memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
|
|
|
|
#ifdef CONFIG_KEXEC_CORE
|
|
|
|
if (crashk_res.end)
|
|
|
|
memblock_mark_nomap(crashk_res.start,
|
|
|
|
resource_size(&crashk_res));
|
|
|
|
#endif
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
|
2017-04-03 10:24:34 +08:00
|
|
|
/* map all the memory banks */
|
|
|
|
for_each_memblock(memory, reg) {
|
|
|
|
phys_addr_t start = reg->base;
|
|
|
|
phys_addr_t end = start + reg->size;
|
2015-01-22 09:36:06 +08:00
|
|
|
|
2017-04-03 10:24:34 +08:00
|
|
|
if (start >= end)
|
|
|
|
break;
|
|
|
|
if (memblock_is_nomap(reg))
|
|
|
|
continue;
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
__map_memblock(pgdp, start, end, PAGE_KERNEL, flags);
|
2017-04-03 10:24:34 +08:00
|
|
|
}
|
2016-02-16 20:52:40 +08:00
|
|
|
|
|
|
|
/*
|
2017-03-10 04:52:01 +08:00
|
|
|
* Map the linear alias of the [_text, __init_begin) interval
|
|
|
|
* as non-executable now, and remove the write permission in
|
|
|
|
* mark_linear_text_alias_ro() below (which will be called after
|
|
|
|
* alternative patching has completed). This makes the contents
|
|
|
|
* of the region accessible to subsystems such as hibernate,
|
|
|
|
* but protects it from inadvertent modification or execution.
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
* Note that contiguous mappings cannot be remapped in this way,
|
|
|
|
* so we should avoid them here.
|
2016-02-16 20:52:40 +08:00
|
|
|
*/
|
2018-02-15 19:14:56 +08:00
|
|
|
__map_memblock(pgdp, kernel_start, kernel_end,
|
2017-04-03 10:24:34 +08:00
|
|
|
PAGE_KERNEL, NO_CONT_MAPPINGS);
|
|
|
|
memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
|
2015-01-22 09:36:06 +08:00
|
|
|
|
2017-04-03 10:24:34 +08:00
|
|
|
#ifdef CONFIG_KEXEC_CORE
|
2017-03-10 04:52:01 +08:00
|
|
|
/*
|
2017-04-03 10:24:34 +08:00
|
|
|
* Use page-level mappings here so that we can shrink the region
|
|
|
|
* in page granularity and put back unused memory to buddy system
|
|
|
|
* through /sys/kernel/kexec_crash_size interface.
|
2017-03-10 04:52:01 +08:00
|
|
|
*/
|
2017-04-03 10:24:34 +08:00
|
|
|
if (crashk_res.end) {
|
2018-02-15 19:14:56 +08:00
|
|
|
__map_memblock(pgdp, crashk_res.start, crashk_res.end + 1,
|
2017-04-03 10:24:34 +08:00
|
|
|
PAGE_KERNEL,
|
|
|
|
NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
|
|
|
|
memblock_clear_nomap(crashk_res.start,
|
|
|
|
resource_size(&crashk_res));
|
2012-03-05 19:49:27 +08:00
|
|
|
}
|
2017-04-03 10:24:34 +08:00
|
|
|
#endif
|
2012-03-05 19:49:27 +08:00
|
|
|
}
|
|
|
|
|
2015-01-22 09:36:06 +08:00
|
|
|
void mark_rodata_ro(void)
|
|
|
|
{
|
2016-02-20 01:50:32 +08:00
|
|
|
unsigned long section_size;
|
2016-02-16 20:52:40 +08:00
|
|
|
|
2016-02-20 01:50:32 +08:00
|
|
|
/*
|
arm64: mm: fix location of _etext
As Kees Cook notes in the ARM counterpart of this patch [0]:
The _etext position is defined to be the end of the kernel text code,
and should not include any part of the data segments. This interferes
with things that might check memory ranges and expect executable code
up to _etext.
In particular, Kees is referring to the HARDENED_USERCOPY patch set [1],
which rejects attempts to call copy_to_user() on kernel ranges containing
executable code, but does allow access to the .rodata segment. Regardless
of whether one may or may not agree with the distinction, it makes sense
for _etext to have the same meaning across architectures.
So let's put _etext where it belongs, between .text and .rodata, and fix
up existing references to use __init_begin instead, which unlike _end_rodata
includes the exception and notes sections as well.
The _etext references in kaslr.c are left untouched, since its references
to [_stext, _etext) are meant to capture potential jump instruction targets,
and so disregarding .rodata is actually an improvement here.
[0] http://article.gmane.org/gmane.linux.kernel/2245084
[1] http://thread.gmane.org/gmane.linux.kernel.hardened.devel/2502
Reported-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-06-23 21:53:17 +08:00
|
|
|
* mark .rodata as read only. Use __init_begin rather than __end_rodata
|
|
|
|
* to cover NOTES and EXCEPTION_TABLE.
|
2016-02-20 01:50:32 +08:00
|
|
|
*/
|
arm64: mm: fix location of _etext
As Kees Cook notes in the ARM counterpart of this patch [0]:
The _etext position is defined to be the end of the kernel text code,
and should not include any part of the data segments. This interferes
with things that might check memory ranges and expect executable code
up to _etext.
In particular, Kees is referring to the HARDENED_USERCOPY patch set [1],
which rejects attempts to call copy_to_user() on kernel ranges containing
executable code, but does allow access to the .rodata segment. Regardless
of whether one may or may not agree with the distinction, it makes sense
for _etext to have the same meaning across architectures.
So let's put _etext where it belongs, between .text and .rodata, and fix
up existing references to use __init_begin instead, which unlike _end_rodata
includes the exception and notes sections as well.
The _etext references in kaslr.c are left untouched, since its references
to [_stext, _etext) are meant to capture potential jump instruction targets,
and so disregarding .rodata is actually an improvement here.
[0] http://article.gmane.org/gmane.linux.kernel/2245084
[1] http://thread.gmane.org/gmane.linux.kernel.hardened.devel/2502
Reported-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-06-23 21:53:17 +08:00
|
|
|
section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata;
|
2017-03-10 04:52:00 +08:00
|
|
|
update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata,
|
2016-02-20 01:50:32 +08:00
|
|
|
section_size, PAGE_KERNEL_RO);
|
2016-10-21 19:22:56 +08:00
|
|
|
|
2016-10-28 00:27:34 +08:00
|
|
|
debug_checkwx();
|
2015-01-22 09:36:06 +08:00
|
|
|
}
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end,
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
pgprot_t prot, struct vm_struct *vma,
|
2017-07-24 18:46:09 +08:00
|
|
|
int flags, unsigned long vm_flags)
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
{
|
2017-01-11 05:35:49 +08:00
|
|
|
phys_addr_t pa_start = __pa_symbol(va_start);
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
unsigned long size = va_end - va_start;
|
|
|
|
|
|
|
|
BUG_ON(!PAGE_ALIGNED(pa_start));
|
|
|
|
BUG_ON(!PAGE_ALIGNED(size));
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
__create_pgd_mapping(pgdp, pa_start, (unsigned long)va_start, size, prot,
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
early_pgtable_alloc, flags);
|
2016-02-16 20:52:40 +08:00
|
|
|
|
2017-07-24 18:46:09 +08:00
|
|
|
if (!(vm_flags & VM_NO_GUARD))
|
|
|
|
size += PAGE_SIZE;
|
|
|
|
|
2016-02-16 20:52:40 +08:00
|
|
|
vma->addr = va_start;
|
|
|
|
vma->phys_addr = pa_start;
|
|
|
|
vma->size = size;
|
2017-07-24 18:46:09 +08:00
|
|
|
vma->flags = VM_MAP | vm_flags;
|
2016-02-16 20:52:40 +08:00
|
|
|
vma->caller = __builtin_return_address(0);
|
|
|
|
|
|
|
|
vm_area_add_early(vma);
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
}
|
|
|
|
|
2017-03-10 04:52:02 +08:00
|
|
|
static int __init parse_rodata(char *arg)
|
|
|
|
{
|
arm64: mm: apply r/o permissions of VM areas to its linear alias as well
On arm64, we use block mappings and contiguous hints to map the linear
region, to minimize the TLB footprint. However, this means that the
entire region is mapped using read/write permissions, which we cannot
modify at page granularity without having to take intrusive measures to
prevent TLB conflicts.
This means the linear aliases of pages belonging to read-only mappings
(executable or otherwise) in the vmalloc region are also mapped read/write,
and could potentially be abused to modify things like module code, bpf JIT
code or other read-only data.
So let's fix this, by extending the set_memory_ro/rw routines to take
the linear alias into account. The consequence of enabling this is
that we can no longer use block mappings or contiguous hints, so in
cases where the TLB footprint of the linear region is a bottleneck,
performance may be affected.
Therefore, allow this feature to be runtime en/disabled, by setting
rodata=full (or 'on' to disable just this enhancement, or 'off' to
disable read-only mappings for code and r/o data entirely) on the
kernel command line. Also, allow the default value to be set via a
Kconfig option.
Tested-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
2018-11-07 18:36:20 +08:00
|
|
|
int ret = strtobool(arg, &rodata_enabled);
|
|
|
|
if (!ret) {
|
|
|
|
rodata_full = false;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* permit 'full' in addition to boolean options */
|
|
|
|
if (strcmp(arg, "full"))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
rodata_enabled = true;
|
|
|
|
rodata_full = true;
|
|
|
|
return 0;
|
2017-03-10 04:52:02 +08:00
|
|
|
}
|
|
|
|
early_param("rodata", parse_rodata);
|
|
|
|
|
2017-11-14 22:14:17 +08:00
|
|
|
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
|
|
|
|
static int __init map_entry_trampoline(void)
|
|
|
|
{
|
|
|
|
pgprot_t prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
|
|
|
|
phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start);
|
|
|
|
|
|
|
|
/* The trampoline is always mapped and can therefore be global */
|
|
|
|
pgprot_val(prot) &= ~PTE_NG;
|
|
|
|
|
|
|
|
/* Map only the text into the trampoline page table */
|
|
|
|
memset(tramp_pg_dir, 0, PGD_SIZE);
|
|
|
|
__create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, PAGE_SIZE,
|
2019-04-08 18:23:48 +08:00
|
|
|
prot, __pgd_pgtable_alloc, 0);
|
2017-11-14 22:14:17 +08:00
|
|
|
|
2017-12-06 19:24:02 +08:00
|
|
|
/* Map both the text and data into the kernel page table */
|
2017-11-14 22:14:17 +08:00
|
|
|
__set_fixmap(FIX_ENTRY_TRAMP_TEXT, pa_start, prot);
|
2017-12-06 19:24:02 +08:00
|
|
|
if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
|
|
|
|
extern char __entry_tramp_data_start[];
|
|
|
|
|
|
|
|
__set_fixmap(FIX_ENTRY_TRAMP_DATA,
|
|
|
|
__pa_symbol(__entry_tramp_data_start),
|
|
|
|
PAGE_KERNEL_RO);
|
|
|
|
}
|
|
|
|
|
2017-11-14 22:14:17 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
core_initcall(map_entry_trampoline);
|
|
|
|
#endif
|
|
|
|
|
2020-05-07 03:51:31 +08:00
|
|
|
/*
|
|
|
|
* Open coded check for BTI, only for use to determine configuration
|
|
|
|
* for early mappings for before the cpufeature code has run.
|
|
|
|
*/
|
|
|
|
static bool arm64_early_this_cpu_has_bti(void)
|
|
|
|
{
|
|
|
|
u64 pfr1;
|
|
|
|
|
|
|
|
if (!IS_ENABLED(CONFIG_ARM64_BTI_KERNEL))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
pfr1 = read_sysreg_s(SYS_ID_AA64PFR1_EL1);
|
|
|
|
return cpuid_feature_extract_unsigned_field(pfr1,
|
|
|
|
ID_AA64PFR1_BT_SHIFT);
|
|
|
|
}
|
|
|
|
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
/*
|
|
|
|
* Create fine-grained mappings for the kernel.
|
|
|
|
*/
|
2018-02-15 19:14:56 +08:00
|
|
|
static void __init map_kernel(pgd_t *pgdp)
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
{
|
2017-03-10 04:52:03 +08:00
|
|
|
static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_inittext,
|
|
|
|
vmlinux_initdata, vmlinux_data;
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
|
2017-03-10 04:52:02 +08:00
|
|
|
/*
|
|
|
|
* External debuggers may need to write directly to the text
|
|
|
|
* mapping to install SW breakpoints. Allow this (only) when
|
|
|
|
* explicitly requested with rodata=off.
|
|
|
|
*/
|
|
|
|
pgprot_t text_prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
|
|
|
|
|
2020-05-07 03:51:31 +08:00
|
|
|
/*
|
|
|
|
* If we have a CPU that supports BTI and a kernel built for
|
|
|
|
* BTI then mark the kernel executable text as guarded pages
|
|
|
|
* now so we don't have to rewrite the page tables later.
|
|
|
|
*/
|
|
|
|
if (arm64_early_this_cpu_has_bti())
|
|
|
|
text_prot = __pgprot_modify(text_prot, PTE_GP, PTE_GP);
|
|
|
|
|
arm64: mm: set the contiguous bit for kernel mappings where appropriate
This is the third attempt at enabling the use of contiguous hints for
kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
it turned out that updating permission attributes on live contiguous ranges
may result in TLB conflicts. So this time, the contiguous hint is not set
for .rodata or for the linear alias of .text/.rodata, both of which are
mapped read-write initially, and remapped read-only at a later stage.
(Note that the latter region could also be unmapped and remapped again
with updated permission attributes, given that the region, while live, is
only mapped for the convenience of the hibernation code, but that also
means the TLB footprint is negligible anyway, so why bother)
This enables the following contiguous range sizes for the virtual mapping
of the kernel image, and for the linear mapping:
granule size | cont PTE | cont PMD |
-------------+------------+------------+
4 KB | 64 KB | 32 MB |
16 KB | 2 MB | 1 GB* |
64 KB | 2 MB | 16 GB* |
* Only when built for 3 or more levels of translation. This is due to the
fact that a 2 level configuration only consists of PGDs and PTEs, and the
added complexity of dealing with folded PMDs is not justified considering
that 16 GB contiguous ranges are likely to be ignored by the hardware (and
16k/2 levels is a niche configuration)
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2017-03-10 04:52:09 +08:00
|
|
|
/*
|
|
|
|
* Only rodata will be remapped with different permissions later on,
|
|
|
|
* all other segments are allowed to use contiguous mappings.
|
|
|
|
*/
|
2018-02-15 19:14:56 +08:00
|
|
|
map_kernel_segment(pgdp, _text, _etext, text_prot, &vmlinux_text, 0,
|
2017-07-24 18:46:09 +08:00
|
|
|
VM_NO_GUARD);
|
2018-02-15 19:14:56 +08:00
|
|
|
map_kernel_segment(pgdp, __start_rodata, __inittext_begin, PAGE_KERNEL,
|
2017-07-24 18:46:09 +08:00
|
|
|
&vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD);
|
2018-02-15 19:14:56 +08:00
|
|
|
map_kernel_segment(pgdp, __inittext_begin, __inittext_end, text_prot,
|
2017-07-24 18:46:09 +08:00
|
|
|
&vmlinux_inittext, 0, VM_NO_GUARD);
|
2018-02-15 19:14:56 +08:00
|
|
|
map_kernel_segment(pgdp, __initdata_begin, __initdata_end, PAGE_KERNEL,
|
2017-07-24 18:46:09 +08:00
|
|
|
&vmlinux_initdata, 0, VM_NO_GUARD);
|
2018-02-15 19:14:56 +08:00
|
|
|
map_kernel_segment(pgdp, _data, _end, PAGE_KERNEL, &vmlinux_data, 0, 0);
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
if (!READ_ONCE(pgd_val(*pgd_offset_raw(pgdp, FIXADDR_START)))) {
|
2016-02-16 20:52:40 +08:00
|
|
|
/*
|
|
|
|
* The fixmap falls in a separate pgd to the kernel, and doesn't
|
|
|
|
* live in the carveout for the swapper_pg_dir. We can simply
|
|
|
|
* re-use the existing dir for the fixmap.
|
|
|
|
*/
|
2018-02-15 19:14:56 +08:00
|
|
|
set_pgd(pgd_offset_raw(pgdp, FIXADDR_START),
|
|
|
|
READ_ONCE(*pgd_offset_k(FIXADDR_START)));
|
2016-02-16 20:52:40 +08:00
|
|
|
} else if (CONFIG_PGTABLE_LEVELS > 3) {
|
2019-08-27 23:57:08 +08:00
|
|
|
pgd_t *bm_pgdp;
|
|
|
|
pud_t *bm_pudp;
|
2016-02-16 20:52:40 +08:00
|
|
|
/*
|
|
|
|
* The fixmap shares its top level pgd entry with the kernel
|
|
|
|
* mapping. This can really only occur when we are running
|
|
|
|
* with 16k/4 levels, so we can simply reuse the pud level
|
|
|
|
* entry instead.
|
|
|
|
*/
|
|
|
|
BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
|
2019-08-27 23:57:08 +08:00
|
|
|
bm_pgdp = pgd_offset_raw(pgdp, FIXADDR_START);
|
|
|
|
bm_pudp = pud_set_fixmap_offset(bm_pgdp, FIXADDR_START);
|
|
|
|
pud_populate(&init_mm, bm_pudp, lm_alias(bm_pmd));
|
2016-02-16 20:52:40 +08:00
|
|
|
pud_clear_fixmap();
|
|
|
|
} else {
|
|
|
|
BUG();
|
|
|
|
}
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
kasan_copy_shadow(pgdp);
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
}
|
|
|
|
|
2012-03-05 19:49:27 +08:00
|
|
|
void __init paging_init(void)
|
|
|
|
{
|
2018-09-25 00:15:02 +08:00
|
|
|
pgd_t *pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir));
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
map_kernel(pgdp);
|
|
|
|
map_mem(pgdp);
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
|
|
|
|
pgd_clear_fixmap();
|
|
|
|
|
2017-01-11 05:35:49 +08:00
|
|
|
cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
|
arm64/mm: Separate boot-time page tables from swapper_pg_dir
Since the address of swapper_pg_dir is fixed for a given kernel image,
it is an attractive target for manipulation via an arbitrary write. To
mitigate this we'd like to make it read-only by moving it into the
rodata section.
We require that swapper_pg_dir is at a fixed offset from tramp_pg_dir
and reserved_ttbr0, so these will also need to move into rodata.
However, swapper_pg_dir is allocated along with some transient page
tables used for boot which we do not want to move into rodata.
As a step towards this, this patch separates the boot-time page tables
into a new init_pg_dir, and reduces swapper_pg_dir to the single page it
needs to be. This allows us to retain the relationship between
swapper_pg_dir, tramp_pg_dir, and swapper_pg_dir, while cleanly
separating these from the boot-time page tables.
The init_pg_dir holds all of the pgd/pud/pmd/pte levels needed during
boot, and all of these levels will be freed when we switch to the
swapper_pg_dir, which is initialized by the existing code in
paging_init(). Since we start off on the init_pg_dir, we no longer need
to allocate a transient page table in paging_init() in order to ensure
that swapper_pg_dir isn't live while we initialize it.
There should be no functional change as a result of this patch.
Signed-off-by: Jun Yao <yaojun8558363@gmail.com>
Reviewed-by: James Morse <james.morse@arm.com>
[Mark: place init_pg_dir after BSS, fold mm changes, commit message]
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2018-09-24 22:47:49 +08:00
|
|
|
init_mm.pgd = swapper_pg_dir;
|
arm64: mm: create new fine-grained mappings at boot
At boot we may change the granularity of the tables mapping the kernel
(by splitting or making sections). This may happen when we create the
linear mapping (in __map_memblock), or at any point we try to apply
fine-grained permissions to the kernel (e.g. fixup_executable,
mark_rodata_ro, fixup_init).
Changing the active page tables in this manner may result in multiple
entries for the same address being allocated into TLBs, risking problems
such as TLB conflict aborts or issues derived from the amalgamation of
TLB entries. Generally, a break-before-make (BBM) approach is necessary
to avoid conflicts, but we cannot do this for the kernel tables as it
risks unmapping text or data being used to do so.
Instead, we can create a new set of tables from scratch in the safety of
the existing mappings, and subsequently migrate over to these using the
new cpu_replace_ttbr1 helper, which avoids the two sets of tables being
active simultaneously.
To avoid issues when we later modify permissions of the page tables
(e.g. in fixup_init), we must create the page tables at a granularity
such that later modification does not result in splitting of tables.
This patch applies this strategy, creating a new set of fine-grained
page tables from scratch, and safely migrating to them. The existing
fixmap and kasan shadow page tables are reused in the new fine-grained
tables.
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Cc: Will Deacon <will.deacon@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-25 19:45:12 +08:00
|
|
|
|
arm64/mm: Separate boot-time page tables from swapper_pg_dir
Since the address of swapper_pg_dir is fixed for a given kernel image,
it is an attractive target for manipulation via an arbitrary write. To
mitigate this we'd like to make it read-only by moving it into the
rodata section.
We require that swapper_pg_dir is at a fixed offset from tramp_pg_dir
and reserved_ttbr0, so these will also need to move into rodata.
However, swapper_pg_dir is allocated along with some transient page
tables used for boot which we do not want to move into rodata.
As a step towards this, this patch separates the boot-time page tables
into a new init_pg_dir, and reduces swapper_pg_dir to the single page it
needs to be. This allows us to retain the relationship between
swapper_pg_dir, tramp_pg_dir, and swapper_pg_dir, while cleanly
separating these from the boot-time page tables.
The init_pg_dir holds all of the pgd/pud/pmd/pte levels needed during
boot, and all of these levels will be freed when we switch to the
swapper_pg_dir, which is initialized by the existing code in
paging_init(). Since we start off on the init_pg_dir, we no longer need
to allocate a transient page table in paging_init() in order to ensure
that swapper_pg_dir isn't live while we initialize it.
There should be no functional change as a result of this patch.
Signed-off-by: Jun Yao <yaojun8558363@gmail.com>
Reviewed-by: James Morse <james.morse@arm.com>
[Mark: place init_pg_dir after BSS, fold mm changes, commit message]
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2018-09-24 22:47:49 +08:00
|
|
|
memblock_free(__pa_symbol(init_pg_dir),
|
|
|
|
__pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));
|
2018-11-07 22:16:06 +08:00
|
|
|
|
|
|
|
memblock_allow_resize();
|
2012-03-05 19:49:27 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check whether a kernel address is valid (derived from arch/x86/).
|
|
|
|
*/
|
|
|
|
int kern_addr_valid(unsigned long addr)
|
|
|
|
{
|
2018-02-15 19:14:56 +08:00
|
|
|
pgd_t *pgdp;
|
|
|
|
pud_t *pudp, pud;
|
|
|
|
pmd_t *pmdp, pmd;
|
|
|
|
pte_t *ptep, pte;
|
2012-03-05 19:49:27 +08:00
|
|
|
|
|
|
|
if ((((long)addr) >> VA_BITS) != -1UL)
|
|
|
|
return 0;
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
pgdp = pgd_offset_k(addr);
|
|
|
|
if (pgd_none(READ_ONCE(*pgdp)))
|
2012-03-05 19:49:27 +08:00
|
|
|
return 0;
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
pudp = pud_offset(pgdp, addr);
|
|
|
|
pud = READ_ONCE(*pudp);
|
|
|
|
if (pud_none(pud))
|
2012-03-05 19:49:27 +08:00
|
|
|
return 0;
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
if (pud_sect(pud))
|
|
|
|
return pfn_valid(pud_pfn(pud));
|
2014-05-06 21:02:27 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
pmdp = pmd_offset(pudp, addr);
|
|
|
|
pmd = READ_ONCE(*pmdp);
|
|
|
|
if (pmd_none(pmd))
|
2012-03-05 19:49:27 +08:00
|
|
|
return 0;
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
if (pmd_sect(pmd))
|
|
|
|
return pfn_valid(pmd_pfn(pmd));
|
2014-04-16 01:53:24 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
ptep = pte_offset_kernel(pmdp, addr);
|
|
|
|
pte = READ_ONCE(*ptep);
|
|
|
|
if (pte_none(pte))
|
2012-03-05 19:49:27 +08:00
|
|
|
return 0;
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
return pfn_valid(pte_pfn(pte));
|
2012-03-05 19:49:27 +08:00
|
|
|
}
|
2020-03-04 12:28:43 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
|
|
|
static void free_hotplug_page_range(struct page *page, size_t size)
|
|
|
|
{
|
|
|
|
WARN_ON(PageReserved(page));
|
|
|
|
free_pages((unsigned long)page_address(page), get_order(size));
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_hotplug_pgtable_page(struct page *page)
|
|
|
|
{
|
|
|
|
free_hotplug_page_range(page, PAGE_SIZE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool pgtable_range_aligned(unsigned long start, unsigned long end,
|
|
|
|
unsigned long floor, unsigned long ceiling,
|
|
|
|
unsigned long mask)
|
|
|
|
{
|
|
|
|
start &= mask;
|
|
|
|
if (start < floor)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (ceiling) {
|
|
|
|
ceiling &= mask;
|
|
|
|
if (!ceiling)
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (end - 1 > ceiling - 1)
|
|
|
|
return false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
|
|
|
|
unsigned long end, bool free_mapped)
|
|
|
|
{
|
|
|
|
pte_t *ptep, pte;
|
|
|
|
|
|
|
|
do {
|
|
|
|
ptep = pte_offset_kernel(pmdp, addr);
|
|
|
|
pte = READ_ONCE(*ptep);
|
|
|
|
if (pte_none(pte))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
WARN_ON(!pte_present(pte));
|
|
|
|
pte_clear(&init_mm, addr, ptep);
|
|
|
|
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
|
|
|
|
if (free_mapped)
|
|
|
|
free_hotplug_page_range(pte_page(pte), PAGE_SIZE);
|
|
|
|
} while (addr += PAGE_SIZE, addr < end);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
|
|
|
|
unsigned long end, bool free_mapped)
|
|
|
|
{
|
|
|
|
unsigned long next;
|
|
|
|
pmd_t *pmdp, pmd;
|
|
|
|
|
|
|
|
do {
|
|
|
|
next = pmd_addr_end(addr, end);
|
|
|
|
pmdp = pmd_offset(pudp, addr);
|
|
|
|
pmd = READ_ONCE(*pmdp);
|
|
|
|
if (pmd_none(pmd))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
WARN_ON(!pmd_present(pmd));
|
|
|
|
if (pmd_sect(pmd)) {
|
|
|
|
pmd_clear(pmdp);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* One TLBI should be sufficient here as the PMD_SIZE
|
|
|
|
* range is mapped with a single block entry.
|
|
|
|
*/
|
|
|
|
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
|
|
|
|
if (free_mapped)
|
|
|
|
free_hotplug_page_range(pmd_page(pmd),
|
|
|
|
PMD_SIZE);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
WARN_ON(!pmd_table(pmd));
|
|
|
|
unmap_hotplug_pte_range(pmdp, addr, next, free_mapped);
|
|
|
|
} while (addr = next, addr < end);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
|
|
|
|
unsigned long end, bool free_mapped)
|
|
|
|
{
|
|
|
|
unsigned long next;
|
|
|
|
pud_t *pudp, pud;
|
|
|
|
|
|
|
|
do {
|
|
|
|
next = pud_addr_end(addr, end);
|
|
|
|
pudp = pud_offset(p4dp, addr);
|
|
|
|
pud = READ_ONCE(*pudp);
|
|
|
|
if (pud_none(pud))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
WARN_ON(!pud_present(pud));
|
|
|
|
if (pud_sect(pud)) {
|
|
|
|
pud_clear(pudp);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* One TLBI should be sufficient here as the PUD_SIZE
|
|
|
|
* range is mapped with a single block entry.
|
|
|
|
*/
|
|
|
|
flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
|
|
|
|
if (free_mapped)
|
|
|
|
free_hotplug_page_range(pud_page(pud),
|
|
|
|
PUD_SIZE);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
WARN_ON(!pud_table(pud));
|
|
|
|
unmap_hotplug_pmd_range(pudp, addr, next, free_mapped);
|
|
|
|
} while (addr = next, addr < end);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
|
|
|
|
unsigned long end, bool free_mapped)
|
|
|
|
{
|
|
|
|
unsigned long next;
|
|
|
|
p4d_t *p4dp, p4d;
|
|
|
|
|
|
|
|
do {
|
|
|
|
next = p4d_addr_end(addr, end);
|
|
|
|
p4dp = p4d_offset(pgdp, addr);
|
|
|
|
p4d = READ_ONCE(*p4dp);
|
|
|
|
if (p4d_none(p4d))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
WARN_ON(!p4d_present(p4d));
|
|
|
|
unmap_hotplug_pud_range(p4dp, addr, next, free_mapped);
|
|
|
|
} while (addr = next, addr < end);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void unmap_hotplug_range(unsigned long addr, unsigned long end,
|
|
|
|
bool free_mapped)
|
|
|
|
{
|
|
|
|
unsigned long next;
|
|
|
|
pgd_t *pgdp, pgd;
|
|
|
|
|
|
|
|
do {
|
|
|
|
next = pgd_addr_end(addr, end);
|
|
|
|
pgdp = pgd_offset_k(addr);
|
|
|
|
pgd = READ_ONCE(*pgdp);
|
|
|
|
if (pgd_none(pgd))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
WARN_ON(!pgd_present(pgd));
|
|
|
|
unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped);
|
|
|
|
} while (addr = next, addr < end);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
|
|
|
|
unsigned long end, unsigned long floor,
|
|
|
|
unsigned long ceiling)
|
|
|
|
{
|
|
|
|
pte_t *ptep, pte;
|
|
|
|
unsigned long i, start = addr;
|
|
|
|
|
|
|
|
do {
|
|
|
|
ptep = pte_offset_kernel(pmdp, addr);
|
|
|
|
pte = READ_ONCE(*ptep);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is just a sanity check here which verifies that
|
|
|
|
* pte clearing has been done by earlier unmap loops.
|
|
|
|
*/
|
|
|
|
WARN_ON(!pte_none(pte));
|
|
|
|
} while (addr += PAGE_SIZE, addr < end);
|
|
|
|
|
|
|
|
if (!pgtable_range_aligned(start, end, floor, ceiling, PMD_MASK))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check whether we can free the pte page if the rest of the
|
|
|
|
* entries are empty. Overlap with other regions have been
|
|
|
|
* handled by the floor/ceiling check.
|
|
|
|
*/
|
|
|
|
ptep = pte_offset_kernel(pmdp, 0UL);
|
|
|
|
for (i = 0; i < PTRS_PER_PTE; i++) {
|
|
|
|
if (!pte_none(READ_ONCE(ptep[i])))
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
pmd_clear(pmdp);
|
|
|
|
__flush_tlb_kernel_pgtable(start);
|
|
|
|
free_hotplug_pgtable_page(virt_to_page(ptep));
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
|
|
|
|
unsigned long end, unsigned long floor,
|
|
|
|
unsigned long ceiling)
|
|
|
|
{
|
|
|
|
pmd_t *pmdp, pmd;
|
|
|
|
unsigned long i, next, start = addr;
|
|
|
|
|
|
|
|
do {
|
|
|
|
next = pmd_addr_end(addr, end);
|
|
|
|
pmdp = pmd_offset(pudp, addr);
|
|
|
|
pmd = READ_ONCE(*pmdp);
|
|
|
|
if (pmd_none(pmd))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
|
|
|
|
free_empty_pte_table(pmdp, addr, next, floor, ceiling);
|
|
|
|
} while (addr = next, addr < end);
|
|
|
|
|
|
|
|
if (CONFIG_PGTABLE_LEVELS <= 2)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!pgtable_range_aligned(start, end, floor, ceiling, PUD_MASK))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check whether we can free the pmd page if the rest of the
|
|
|
|
* entries are empty. Overlap with other regions have been
|
|
|
|
* handled by the floor/ceiling check.
|
|
|
|
*/
|
|
|
|
pmdp = pmd_offset(pudp, 0UL);
|
|
|
|
for (i = 0; i < PTRS_PER_PMD; i++) {
|
|
|
|
if (!pmd_none(READ_ONCE(pmdp[i])))
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
pud_clear(pudp);
|
|
|
|
__flush_tlb_kernel_pgtable(start);
|
|
|
|
free_hotplug_pgtable_page(virt_to_page(pmdp));
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
|
|
|
|
unsigned long end, unsigned long floor,
|
|
|
|
unsigned long ceiling)
|
|
|
|
{
|
|
|
|
pud_t *pudp, pud;
|
|
|
|
unsigned long i, next, start = addr;
|
|
|
|
|
|
|
|
do {
|
|
|
|
next = pud_addr_end(addr, end);
|
|
|
|
pudp = pud_offset(p4dp, addr);
|
|
|
|
pud = READ_ONCE(*pudp);
|
|
|
|
if (pud_none(pud))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud));
|
|
|
|
free_empty_pmd_table(pudp, addr, next, floor, ceiling);
|
|
|
|
} while (addr = next, addr < end);
|
|
|
|
|
|
|
|
if (CONFIG_PGTABLE_LEVELS <= 3)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check whether we can free the pud page if the rest of the
|
|
|
|
* entries are empty. Overlap with other regions have been
|
|
|
|
* handled by the floor/ceiling check.
|
|
|
|
*/
|
|
|
|
pudp = pud_offset(p4dp, 0UL);
|
|
|
|
for (i = 0; i < PTRS_PER_PUD; i++) {
|
|
|
|
if (!pud_none(READ_ONCE(pudp[i])))
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
p4d_clear(p4dp);
|
|
|
|
__flush_tlb_kernel_pgtable(start);
|
|
|
|
free_hotplug_pgtable_page(virt_to_page(pudp));
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
|
|
|
|
unsigned long end, unsigned long floor,
|
|
|
|
unsigned long ceiling)
|
|
|
|
{
|
|
|
|
unsigned long next;
|
|
|
|
p4d_t *p4dp, p4d;
|
|
|
|
|
|
|
|
do {
|
|
|
|
next = p4d_addr_end(addr, end);
|
|
|
|
p4dp = p4d_offset(pgdp, addr);
|
|
|
|
p4d = READ_ONCE(*p4dp);
|
|
|
|
if (p4d_none(p4d))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
WARN_ON(!p4d_present(p4d));
|
|
|
|
free_empty_pud_table(p4dp, addr, next, floor, ceiling);
|
|
|
|
} while (addr = next, addr < end);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_empty_tables(unsigned long addr, unsigned long end,
|
|
|
|
unsigned long floor, unsigned long ceiling)
|
|
|
|
{
|
|
|
|
unsigned long next;
|
|
|
|
pgd_t *pgdp, pgd;
|
|
|
|
|
|
|
|
do {
|
|
|
|
next = pgd_addr_end(addr, end);
|
|
|
|
pgdp = pgd_offset_k(addr);
|
|
|
|
pgd = READ_ONCE(*pgdp);
|
|
|
|
if (pgd_none(pgd))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
WARN_ON(!pgd_present(pgd));
|
|
|
|
free_empty_p4d_table(pgdp, addr, next, floor, ceiling);
|
|
|
|
} while (addr = next, addr < end);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2012-03-05 19:49:27 +08:00
|
|
|
#ifdef CONFIG_SPARSEMEM_VMEMMAP
|
2015-10-19 21:19:28 +08:00
|
|
|
#if !ARM64_SWAPPER_USES_SECTION_MAPS
|
2017-12-29 15:53:54 +08:00
|
|
|
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
|
|
|
|
struct vmem_altmap *altmap)
|
2012-03-05 19:49:27 +08:00
|
|
|
{
|
2013-04-30 06:07:50 +08:00
|
|
|
return vmemmap_populate_basepages(start, end, node);
|
2012-03-05 19:49:27 +08:00
|
|
|
}
|
2015-10-19 21:19:28 +08:00
|
|
|
#else /* !ARM64_SWAPPER_USES_SECTION_MAPS */
|
2017-12-29 15:53:54 +08:00
|
|
|
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
|
|
|
|
struct vmem_altmap *altmap)
|
2012-03-05 19:49:27 +08:00
|
|
|
{
|
2013-04-30 06:07:50 +08:00
|
|
|
unsigned long addr = start;
|
2012-03-05 19:49:27 +08:00
|
|
|
unsigned long next;
|
2018-02-15 19:14:56 +08:00
|
|
|
pgd_t *pgdp;
|
|
|
|
pud_t *pudp;
|
|
|
|
pmd_t *pmdp;
|
2012-03-05 19:49:27 +08:00
|
|
|
|
|
|
|
do {
|
|
|
|
next = pmd_addr_end(addr, end);
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
pgdp = vmemmap_pgd_populate(addr, node);
|
|
|
|
if (!pgdp)
|
2012-03-05 19:49:27 +08:00
|
|
|
return -ENOMEM;
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
pudp = vmemmap_pud_populate(pgdp, addr, node);
|
|
|
|
if (!pudp)
|
2012-03-05 19:49:27 +08:00
|
|
|
return -ENOMEM;
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
pmdp = pmd_offset(pudp, addr);
|
|
|
|
if (pmd_none(READ_ONCE(*pmdp))) {
|
2012-03-05 19:49:27 +08:00
|
|
|
void *p = NULL;
|
|
|
|
|
|
|
|
p = vmemmap_alloc_block_buf(PMD_SIZE, node);
|
|
|
|
if (!p)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
|
2012-03-05 19:49:27 +08:00
|
|
|
} else
|
2018-02-15 19:14:56 +08:00
|
|
|
vmemmap_verify((pte_t *)pmdp, node, addr, next);
|
2012-03-05 19:49:27 +08:00
|
|
|
} while (addr = next, addr != end);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2019-06-07 07:49:10 +08:00
|
|
|
#endif /* !ARM64_SWAPPER_USES_SECTION_MAPS */
|
2017-12-29 15:53:56 +08:00
|
|
|
void vmemmap_free(unsigned long start, unsigned long end,
|
|
|
|
struct vmem_altmap *altmap)
|
2013-02-23 08:33:08 +08:00
|
|
|
{
|
2020-03-04 12:28:43 +08:00
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
|
|
|
WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
|
|
|
|
|
|
|
|
unmap_hotplug_range(start, end, true);
|
|
|
|
free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
|
|
|
|
#endif
|
2013-02-23 08:33:08 +08:00
|
|
|
}
|
2012-03-05 19:49:27 +08:00
|
|
|
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
|
2014-11-22 05:50:42 +08:00
|
|
|
|
|
|
|
static inline pud_t * fixmap_pud(unsigned long addr)
|
|
|
|
{
|
2018-02-15 19:14:56 +08:00
|
|
|
pgd_t *pgdp = pgd_offset_k(addr);
|
|
|
|
pgd_t pgd = READ_ONCE(*pgdp);
|
2014-11-22 05:50:42 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
BUG_ON(pgd_none(pgd) || pgd_bad(pgd));
|
2014-11-22 05:50:42 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
return pud_offset_kimg(pgdp, addr);
|
2014-11-22 05:50:42 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline pmd_t * fixmap_pmd(unsigned long addr)
|
|
|
|
{
|
2018-02-15 19:14:56 +08:00
|
|
|
pud_t *pudp = fixmap_pud(addr);
|
|
|
|
pud_t pud = READ_ONCE(*pudp);
|
2014-11-22 05:50:42 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
BUG_ON(pud_none(pud) || pud_bad(pud));
|
2014-11-22 05:50:42 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
return pmd_offset_kimg(pudp, addr);
|
2014-11-22 05:50:42 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline pte_t * fixmap_pte(unsigned long addr)
|
|
|
|
{
|
2016-02-16 20:52:38 +08:00
|
|
|
return &bm_pte[pte_index(addr)];
|
2014-11-22 05:50:42 +08:00
|
|
|
}
|
|
|
|
|
2017-01-11 05:35:49 +08:00
|
|
|
/*
|
|
|
|
* The p*d_populate functions call virt_to_phys implicitly so they can't be used
|
|
|
|
* directly on kernel symbols (bm_p*d). This function is called too early to use
|
|
|
|
* lm_alias so __p*d_populate functions must be used to populate with the
|
|
|
|
* physical address from __pa_symbol.
|
|
|
|
*/
|
2014-11-22 05:50:42 +08:00
|
|
|
void __init early_fixmap_init(void)
|
|
|
|
{
|
2018-02-15 19:14:56 +08:00
|
|
|
pgd_t *pgdp, pgd;
|
|
|
|
pud_t *pudp;
|
|
|
|
pmd_t *pmdp;
|
2014-11-22 05:50:42 +08:00
|
|
|
unsigned long addr = FIXADDR_START;
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
pgdp = pgd_offset_k(addr);
|
|
|
|
pgd = READ_ONCE(*pgdp);
|
arm64: add support for kernel ASLR
This adds support for KASLR is implemented, based on entropy provided by
the bootloader in the /chosen/kaslr-seed DT property. Depending on the size
of the address space (VA_BITS) and the page size, the entropy in the
virtual displacement is up to 13 bits (16k/2 levels) and up to 25 bits (all
4 levels), with the sidenote that displacements that result in the kernel
image straddling a 1GB/32MB/512MB alignment boundary (for 4KB/16KB/64KB
granule kernels, respectively) are not allowed, and will be rounded up to
an acceptable value.
If CONFIG_RANDOMIZE_MODULE_REGION_FULL is enabled, the module region is
randomized independently from the core kernel. This makes it less likely
that the location of core kernel data structures can be determined by an
adversary, but causes all function calls from modules into the core kernel
to be resolved via entries in the module PLTs.
If CONFIG_RANDOMIZE_MODULE_REGION_FULL is not enabled, the module region is
randomized by choosing a page aligned 128 MB region inside the interval
[_etext - 128 MB, _stext + 128 MB). This gives between 10 and 14 bits of
entropy (depending on page size), independently of the kernel randomization,
but still guarantees that modules are within the range of relative branch
and jump instructions (with the caveat that, since the module region is
shared with other uses of the vmalloc area, modules may need to be loaded
further away if the module region is exhausted)
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-26 21:12:01 +08:00
|
|
|
if (CONFIG_PGTABLE_LEVELS > 3 &&
|
2018-02-15 19:14:56 +08:00
|
|
|
!(pgd_none(pgd) || pgd_page_paddr(pgd) == __pa_symbol(bm_pud))) {
|
2016-02-16 20:52:40 +08:00
|
|
|
/*
|
|
|
|
* We only end up here if the kernel mapping and the fixmap
|
|
|
|
* share the top level pgd entry, which should only happen on
|
|
|
|
* 16k/4 levels configurations.
|
|
|
|
*/
|
|
|
|
BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
|
2018-02-15 19:14:56 +08:00
|
|
|
pudp = pud_offset_kimg(pgdp, addr);
|
2016-02-16 20:52:40 +08:00
|
|
|
} else {
|
2018-02-15 19:14:56 +08:00
|
|
|
if (pgd_none(pgd))
|
|
|
|
__pgd_populate(pgdp, __pa_symbol(bm_pud), PUD_TYPE_TABLE);
|
|
|
|
pudp = fixmap_pud(addr);
|
2016-02-16 20:52:40 +08:00
|
|
|
}
|
2018-02-15 19:14:56 +08:00
|
|
|
if (pud_none(READ_ONCE(*pudp)))
|
|
|
|
__pud_populate(pudp, __pa_symbol(bm_pmd), PMD_TYPE_TABLE);
|
|
|
|
pmdp = fixmap_pmd(addr);
|
|
|
|
__pmd_populate(pmdp, __pa_symbol(bm_pte), PMD_TYPE_TABLE);
|
2014-11-22 05:50:42 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The boot-ioremap range spans multiple pmds, for which
|
2016-02-16 20:52:38 +08:00
|
|
|
* we are not prepared:
|
2014-11-22 05:50:42 +08:00
|
|
|
*/
|
|
|
|
BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
|
|
|
|
!= (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
if ((pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)))
|
|
|
|
|| pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_END))) {
|
2014-11-22 05:50:42 +08:00
|
|
|
WARN_ON(1);
|
2018-02-15 19:14:56 +08:00
|
|
|
pr_warn("pmdp %p != %p, %p\n",
|
|
|
|
pmdp, fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)),
|
2014-11-22 05:50:42 +08:00
|
|
|
fixmap_pmd(fix_to_virt(FIX_BTMAP_END)));
|
|
|
|
pr_warn("fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
|
|
|
|
fix_to_virt(FIX_BTMAP_BEGIN));
|
|
|
|
pr_warn("fix_to_virt(FIX_BTMAP_END): %08lx\n",
|
|
|
|
fix_to_virt(FIX_BTMAP_END));
|
|
|
|
|
|
|
|
pr_warn("FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
|
|
|
|
pr_warn("FIX_BTMAP_BEGIN: %d\n", FIX_BTMAP_BEGIN);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-11-07 02:44:26 +08:00
|
|
|
/*
|
|
|
|
* Unusually, this is also called in IRQ context (ghes_iounmap_irq) so if we
|
|
|
|
* ever need to use IPIs for TLB broadcasting, then we're in trouble here.
|
|
|
|
*/
|
2014-11-22 05:50:42 +08:00
|
|
|
void __set_fixmap(enum fixed_addresses idx,
|
|
|
|
phys_addr_t phys, pgprot_t flags)
|
|
|
|
{
|
|
|
|
unsigned long addr = __fix_to_virt(idx);
|
2018-02-15 19:14:56 +08:00
|
|
|
pte_t *ptep;
|
2014-11-22 05:50:42 +08:00
|
|
|
|
2015-03-04 21:27:35 +08:00
|
|
|
BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses);
|
2014-11-22 05:50:42 +08:00
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
ptep = fixmap_pte(addr);
|
2014-11-22 05:50:42 +08:00
|
|
|
|
|
|
|
if (pgprot_val(flags)) {
|
2018-02-15 19:14:56 +08:00
|
|
|
set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags));
|
2014-11-22 05:50:42 +08:00
|
|
|
} else {
|
2018-02-15 19:14:56 +08:00
|
|
|
pte_clear(&init_mm, addr, ptep);
|
2014-11-22 05:50:42 +08:00
|
|
|
flush_tlb_kernel_range(addr, addr+PAGE_SIZE);
|
|
|
|
}
|
|
|
|
}
|
2015-06-01 19:40:32 +08:00
|
|
|
|
2019-08-23 14:24:50 +08:00
|
|
|
void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
|
2015-06-01 19:40:32 +08:00
|
|
|
{
|
|
|
|
const u64 dt_virt_base = __fix_to_virt(FIX_FDT);
|
arm64: add support for kernel ASLR
This adds support for KASLR is implemented, based on entropy provided by
the bootloader in the /chosen/kaslr-seed DT property. Depending on the size
of the address space (VA_BITS) and the page size, the entropy in the
virtual displacement is up to 13 bits (16k/2 levels) and up to 25 bits (all
4 levels), with the sidenote that displacements that result in the kernel
image straddling a 1GB/32MB/512MB alignment boundary (for 4KB/16KB/64KB
granule kernels, respectively) are not allowed, and will be rounded up to
an acceptable value.
If CONFIG_RANDOMIZE_MODULE_REGION_FULL is enabled, the module region is
randomized independently from the core kernel. This makes it less likely
that the location of core kernel data structures can be determined by an
adversary, but causes all function calls from modules into the core kernel
to be resolved via entries in the module PLTs.
If CONFIG_RANDOMIZE_MODULE_REGION_FULL is not enabled, the module region is
randomized by choosing a page aligned 128 MB region inside the interval
[_etext - 128 MB, _stext + 128 MB). This gives between 10 and 14 bits of
entropy (depending on page size), independently of the kernel randomization,
but still guarantees that modules are within the range of relative branch
and jump instructions (with the caveat that, since the module region is
shared with other uses of the vmalloc area, modules may need to be loaded
further away if the module region is exhausted)
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-26 21:12:01 +08:00
|
|
|
int offset;
|
2015-06-01 19:40:32 +08:00
|
|
|
void *dt_virt;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check whether the physical FDT address is set and meets the minimum
|
|
|
|
* alignment requirement. Since we are relying on MIN_FDT_ALIGN to be
|
2016-08-01 19:29:31 +08:00
|
|
|
* at least 8 bytes so that we can always access the magic and size
|
|
|
|
* fields of the FDT header after mapping the first chunk, double check
|
|
|
|
* here if that is indeed the case.
|
2015-06-01 19:40:32 +08:00
|
|
|
*/
|
|
|
|
BUILD_BUG_ON(MIN_FDT_ALIGN < 8);
|
|
|
|
if (!dt_phys || dt_phys % MIN_FDT_ALIGN)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Make sure that the FDT region can be mapped without the need to
|
|
|
|
* allocate additional translation table pages, so that it is safe
|
2016-02-06 08:24:46 +08:00
|
|
|
* to call create_mapping_noalloc() this early.
|
2015-06-01 19:40:32 +08:00
|
|
|
*
|
|
|
|
* On 64k pages, the FDT will be mapped using PTEs, so we need to
|
|
|
|
* be in the same PMD as the rest of the fixmap.
|
|
|
|
* On 4k pages, we'll use section mappings for the FDT so we only
|
|
|
|
* have to be in the same PUD.
|
|
|
|
*/
|
|
|
|
BUILD_BUG_ON(dt_virt_base % SZ_2M);
|
|
|
|
|
2015-10-19 21:19:28 +08:00
|
|
|
BUILD_BUG_ON(__fix_to_virt(FIX_FDT_END) >> SWAPPER_TABLE_SHIFT !=
|
|
|
|
__fix_to_virt(FIX_BTMAP_BEGIN) >> SWAPPER_TABLE_SHIFT);
|
2015-06-01 19:40:32 +08:00
|
|
|
|
2015-10-19 21:19:28 +08:00
|
|
|
offset = dt_phys % SWAPPER_BLOCK_SIZE;
|
2015-06-01 19:40:32 +08:00
|
|
|
dt_virt = (void *)dt_virt_base + offset;
|
|
|
|
|
|
|
|
/* map the first chunk so we can read the size from the header */
|
2016-02-06 08:24:46 +08:00
|
|
|
create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE),
|
|
|
|
dt_virt_base, SWAPPER_BLOCK_SIZE, prot);
|
2015-06-01 19:40:32 +08:00
|
|
|
|
2016-08-01 19:29:31 +08:00
|
|
|
if (fdt_magic(dt_virt) != FDT_MAGIC)
|
2015-06-01 19:40:32 +08:00
|
|
|
return NULL;
|
|
|
|
|
arm64: add support for kernel ASLR
This adds support for KASLR is implemented, based on entropy provided by
the bootloader in the /chosen/kaslr-seed DT property. Depending on the size
of the address space (VA_BITS) and the page size, the entropy in the
virtual displacement is up to 13 bits (16k/2 levels) and up to 25 bits (all
4 levels), with the sidenote that displacements that result in the kernel
image straddling a 1GB/32MB/512MB alignment boundary (for 4KB/16KB/64KB
granule kernels, respectively) are not allowed, and will be rounded up to
an acceptable value.
If CONFIG_RANDOMIZE_MODULE_REGION_FULL is enabled, the module region is
randomized independently from the core kernel. This makes it less likely
that the location of core kernel data structures can be determined by an
adversary, but causes all function calls from modules into the core kernel
to be resolved via entries in the module PLTs.
If CONFIG_RANDOMIZE_MODULE_REGION_FULL is not enabled, the module region is
randomized by choosing a page aligned 128 MB region inside the interval
[_etext - 128 MB, _stext + 128 MB). This gives between 10 and 14 bits of
entropy (depending on page size), independently of the kernel randomization,
but still guarantees that modules are within the range of relative branch
and jump instructions (with the caveat that, since the module region is
shared with other uses of the vmalloc area, modules may need to be loaded
further away if the module region is exhausted)
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-26 21:12:01 +08:00
|
|
|
*size = fdt_totalsize(dt_virt);
|
|
|
|
if (*size > MAX_FDT_SIZE)
|
2015-06-01 19:40:32 +08:00
|
|
|
return NULL;
|
|
|
|
|
arm64: add support for kernel ASLR
This adds support for KASLR is implemented, based on entropy provided by
the bootloader in the /chosen/kaslr-seed DT property. Depending on the size
of the address space (VA_BITS) and the page size, the entropy in the
virtual displacement is up to 13 bits (16k/2 levels) and up to 25 bits (all
4 levels), with the sidenote that displacements that result in the kernel
image straddling a 1GB/32MB/512MB alignment boundary (for 4KB/16KB/64KB
granule kernels, respectively) are not allowed, and will be rounded up to
an acceptable value.
If CONFIG_RANDOMIZE_MODULE_REGION_FULL is enabled, the module region is
randomized independently from the core kernel. This makes it less likely
that the location of core kernel data structures can be determined by an
adversary, but causes all function calls from modules into the core kernel
to be resolved via entries in the module PLTs.
If CONFIG_RANDOMIZE_MODULE_REGION_FULL is not enabled, the module region is
randomized by choosing a page aligned 128 MB region inside the interval
[_etext - 128 MB, _stext + 128 MB). This gives between 10 and 14 bits of
entropy (depending on page size), independently of the kernel randomization,
but still guarantees that modules are within the range of relative branch
and jump instructions (with the caveat that, since the module region is
shared with other uses of the vmalloc area, modules may need to be loaded
further away if the module region is exhausted)
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-26 21:12:01 +08:00
|
|
|
if (offset + *size > SWAPPER_BLOCK_SIZE)
|
2016-02-06 08:24:46 +08:00
|
|
|
create_mapping_noalloc(round_down(dt_phys, SWAPPER_BLOCK_SIZE), dt_virt_base,
|
arm64: add support for kernel ASLR
This adds support for KASLR is implemented, based on entropy provided by
the bootloader in the /chosen/kaslr-seed DT property. Depending on the size
of the address space (VA_BITS) and the page size, the entropy in the
virtual displacement is up to 13 bits (16k/2 levels) and up to 25 bits (all
4 levels), with the sidenote that displacements that result in the kernel
image straddling a 1GB/32MB/512MB alignment boundary (for 4KB/16KB/64KB
granule kernels, respectively) are not allowed, and will be rounded up to
an acceptable value.
If CONFIG_RANDOMIZE_MODULE_REGION_FULL is enabled, the module region is
randomized independently from the core kernel. This makes it less likely
that the location of core kernel data structures can be determined by an
adversary, but causes all function calls from modules into the core kernel
to be resolved via entries in the module PLTs.
If CONFIG_RANDOMIZE_MODULE_REGION_FULL is not enabled, the module region is
randomized by choosing a page aligned 128 MB region inside the interval
[_etext - 128 MB, _stext + 128 MB). This gives between 10 and 14 bits of
entropy (depending on page size), independently of the kernel randomization,
but still guarantees that modules are within the range of relative branch
and jump instructions (with the caveat that, since the module region is
shared with other uses of the vmalloc area, modules may need to be loaded
further away if the module region is exhausted)
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-26 21:12:01 +08:00
|
|
|
round_up(offset + *size, SWAPPER_BLOCK_SIZE), prot);
|
2015-06-01 19:40:32 +08:00
|
|
|
|
arm64: add support for kernel ASLR
This adds support for KASLR is implemented, based on entropy provided by
the bootloader in the /chosen/kaslr-seed DT property. Depending on the size
of the address space (VA_BITS) and the page size, the entropy in the
virtual displacement is up to 13 bits (16k/2 levels) and up to 25 bits (all
4 levels), with the sidenote that displacements that result in the kernel
image straddling a 1GB/32MB/512MB alignment boundary (for 4KB/16KB/64KB
granule kernels, respectively) are not allowed, and will be rounded up to
an acceptable value.
If CONFIG_RANDOMIZE_MODULE_REGION_FULL is enabled, the module region is
randomized independently from the core kernel. This makes it less likely
that the location of core kernel data structures can be determined by an
adversary, but causes all function calls from modules into the core kernel
to be resolved via entries in the module PLTs.
If CONFIG_RANDOMIZE_MODULE_REGION_FULL is not enabled, the module region is
randomized by choosing a page aligned 128 MB region inside the interval
[_etext - 128 MB, _stext + 128 MB). This gives between 10 and 14 bits of
entropy (depending on page size), independently of the kernel randomization,
but still guarantees that modules are within the range of relative branch
and jump instructions (with the caveat that, since the module region is
shared with other uses of the vmalloc area, modules may need to be loaded
further away if the module region is exhausted)
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2016-01-26 21:12:01 +08:00
|
|
|
return dt_virt;
|
|
|
|
}
|
2015-06-01 19:40:32 +08:00
|
|
|
|
2019-07-17 07:27:33 +08:00
|
|
|
int __init arch_ioremap_p4d_supported(void)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-02-16 20:52:35 +08:00
|
|
|
int __init arch_ioremap_pud_supported(void)
|
|
|
|
{
|
arm64/mm: Inhibit huge-vmap with ptdump
The arm64 ptdump code can race with concurrent modification of the
kernel page tables. At the time this was added, this was sound as:
* Modifications to leaf entries could result in stale information being
logged, but would not result in a functional problem.
* Boot time modifications to non-leaf entries (e.g. freeing of initmem)
were performed when the ptdump code cannot be invoked.
* At runtime, modifications to non-leaf entries only occurred in the
vmalloc region, and these were strictly additive, as intermediate
entries were never freed.
However, since commit:
commit 324420bf91f6 ("arm64: add support for ioremap() block mappings")
... it has been possible to create huge mappings in the vmalloc area at
runtime, and as part of this existing intermediate levels of table my be
removed and freed.
It's possible for the ptdump code to race with this, and continue to
walk tables which have been freed (and potentially poisoned or
reallocated). As a result of this, the ptdump code may dereference bogus
addresses, which could be fatal.
Since huge-vmap is a TLB and memory optimization, we can disable it when
the runtime ptdump code is in use to avoid this problem.
Cc: Catalin Marinas <catalin.marinas@arm.com>
Fixes: 324420bf91f60582 ("arm64: add support for ioremap() block mappings")
Acked-by: Ard Biesheuvel <ard.biesheuvel@arm.com>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
2019-05-14 17:00:06 +08:00
|
|
|
/*
|
|
|
|
* Only 4k granule supports level 1 block mappings.
|
|
|
|
* SW table walks can't handle removal of intermediate entries.
|
|
|
|
*/
|
|
|
|
return IS_ENABLED(CONFIG_ARM64_4K_PAGES) &&
|
2020-02-04 09:36:29 +08:00
|
|
|
!IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
|
2016-02-16 20:52:35 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
int __init arch_ioremap_pmd_supported(void)
|
|
|
|
{
|
arm64/mm: Inhibit huge-vmap with ptdump
The arm64 ptdump code can race with concurrent modification of the
kernel page tables. At the time this was added, this was sound as:
* Modifications to leaf entries could result in stale information being
logged, but would not result in a functional problem.
* Boot time modifications to non-leaf entries (e.g. freeing of initmem)
were performed when the ptdump code cannot be invoked.
* At runtime, modifications to non-leaf entries only occurred in the
vmalloc region, and these were strictly additive, as intermediate
entries were never freed.
However, since commit:
commit 324420bf91f6 ("arm64: add support for ioremap() block mappings")
... it has been possible to create huge mappings in the vmalloc area at
runtime, and as part of this existing intermediate levels of table my be
removed and freed.
It's possible for the ptdump code to race with this, and continue to
walk tables which have been freed (and potentially poisoned or
reallocated). As a result of this, the ptdump code may dereference bogus
addresses, which could be fatal.
Since huge-vmap is a TLB and memory optimization, we can disable it when
the runtime ptdump code is in use to avoid this problem.
Cc: Catalin Marinas <catalin.marinas@arm.com>
Fixes: 324420bf91f60582 ("arm64: add support for ioremap() block mappings")
Acked-by: Ard Biesheuvel <ard.biesheuvel@arm.com>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
2019-05-14 17:00:06 +08:00
|
|
|
/* See arch_ioremap_pud_supported() */
|
2020-02-04 09:36:29 +08:00
|
|
|
return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
|
2016-02-16 20:52:35 +08:00
|
|
|
}
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
|
2016-02-16 20:52:35 +08:00
|
|
|
{
|
2019-05-27 11:58:15 +08:00
|
|
|
pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot));
|
2018-02-21 20:59:27 +08:00
|
|
|
|
2018-05-24 02:43:46 +08:00
|
|
|
/* Only allow permission changes for now */
|
|
|
|
if (!pgattr_change_is_safe(READ_ONCE(pud_val(*pudp)),
|
|
|
|
pud_val(new_pud)))
|
2018-02-21 20:59:27 +08:00
|
|
|
return 0;
|
|
|
|
|
2019-05-27 15:03:29 +08:00
|
|
|
VM_BUG_ON(phys & ~PUD_MASK);
|
2018-05-24 02:43:46 +08:00
|
|
|
set_pud(pudp, new_pud);
|
2016-02-16 20:52:35 +08:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
|
2016-02-16 20:52:35 +08:00
|
|
|
{
|
2019-05-27 11:58:15 +08:00
|
|
|
pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot));
|
2018-02-21 20:59:27 +08:00
|
|
|
|
2018-05-24 02:43:46 +08:00
|
|
|
/* Only allow permission changes for now */
|
|
|
|
if (!pgattr_change_is_safe(READ_ONCE(pmd_val(*pmdp)),
|
|
|
|
pmd_val(new_pmd)))
|
2018-02-21 20:59:27 +08:00
|
|
|
return 0;
|
|
|
|
|
2019-05-27 15:03:29 +08:00
|
|
|
VM_BUG_ON(phys & ~PMD_MASK);
|
2018-05-24 02:43:46 +08:00
|
|
|
set_pmd(pmdp, new_pmd);
|
2016-02-16 20:52:35 +08:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
int pud_clear_huge(pud_t *pudp)
|
2016-02-16 20:52:35 +08:00
|
|
|
{
|
2018-02-15 19:14:56 +08:00
|
|
|
if (!pud_sect(READ_ONCE(*pudp)))
|
2016-02-16 20:52:35 +08:00
|
|
|
return 0;
|
2018-02-15 19:14:56 +08:00
|
|
|
pud_clear(pudp);
|
2016-02-16 20:52:35 +08:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2018-02-15 19:14:56 +08:00
|
|
|
int pmd_clear_huge(pmd_t *pmdp)
|
2016-02-16 20:52:35 +08:00
|
|
|
{
|
2018-02-15 19:14:56 +08:00
|
|
|
if (!pmd_sect(READ_ONCE(*pmdp)))
|
2016-02-16 20:52:35 +08:00
|
|
|
return 0;
|
2018-02-15 19:14:56 +08:00
|
|
|
pmd_clear(pmdp);
|
2016-02-16 20:52:35 +08:00
|
|
|
return 1;
|
|
|
|
}
|
mm/vmalloc: add interfaces to free unmapped page table
On architectures with CONFIG_HAVE_ARCH_HUGE_VMAP set, ioremap() may
create pud/pmd mappings. A kernel panic was observed on arm64 systems
with Cortex-A75 in the following steps as described by Hanjun Guo.
1. ioremap a 4K size, valid page table will build,
2. iounmap it, pte0 will set to 0;
3. ioremap the same address with 2M size, pgd/pmd is unchanged,
then set the a new value for pmd;
4. pte0 is leaked;
5. CPU may meet exception because the old pmd is still in TLB,
which will lead to kernel panic.
This panic is not reproducible on x86. INVLPG, called from iounmap,
purges all levels of entries associated with purged address on x86. x86
still has memory leak.
The patch changes the ioremap path to free unmapped page table(s) since
doing so in the unmap path has the following issues:
- The iounmap() path is shared with vunmap(). Since vmap() only
supports pte mappings, making vunmap() to free a pte page is an
overhead for regular vmap users as they do not need a pte page freed
up.
- Checking if all entries in a pte page are cleared in the unmap path
is racy, and serializing this check is expensive.
- The unmap path calls free_vmap_area_noflush() to do lazy TLB purges.
Clearing a pud/pmd entry before the lazy TLB purges needs extra TLB
purge.
Add two interfaces, pud_free_pmd_page() and pmd_free_pte_page(), which
clear a given pud/pmd entry and free up a page for the lower level
entries.
This patch implements their stub functions on x86 and arm64, which work
as workaround.
[akpm@linux-foundation.org: fix typo in pmd_free_pte_page() stub]
Link: http://lkml.kernel.org/r/20180314180155.19492-2-toshi.kani@hpe.com
Fixes: e61ce6ade404e ("mm: change ioremap to set up huge I/O mappings")
Reported-by: Lei Li <lious.lilei@hisilicon.com>
Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Wang Xuefeng <wxf.wang@hisilicon.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-03-23 07:17:20 +08:00
|
|
|
|
2018-06-06 15:01:21 +08:00
|
|
|
int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
|
mm/vmalloc: add interfaces to free unmapped page table
On architectures with CONFIG_HAVE_ARCH_HUGE_VMAP set, ioremap() may
create pud/pmd mappings. A kernel panic was observed on arm64 systems
with Cortex-A75 in the following steps as described by Hanjun Guo.
1. ioremap a 4K size, valid page table will build,
2. iounmap it, pte0 will set to 0;
3. ioremap the same address with 2M size, pgd/pmd is unchanged,
then set the a new value for pmd;
4. pte0 is leaked;
5. CPU may meet exception because the old pmd is still in TLB,
which will lead to kernel panic.
This panic is not reproducible on x86. INVLPG, called from iounmap,
purges all levels of entries associated with purged address on x86. x86
still has memory leak.
The patch changes the ioremap path to free unmapped page table(s) since
doing so in the unmap path has the following issues:
- The iounmap() path is shared with vunmap(). Since vmap() only
supports pte mappings, making vunmap() to free a pte page is an
overhead for regular vmap users as they do not need a pte page freed
up.
- Checking if all entries in a pte page are cleared in the unmap path
is racy, and serializing this check is expensive.
- The unmap path calls free_vmap_area_noflush() to do lazy TLB purges.
Clearing a pud/pmd entry before the lazy TLB purges needs extra TLB
purge.
Add two interfaces, pud_free_pmd_page() and pmd_free_pte_page(), which
clear a given pud/pmd entry and free up a page for the lower level
entries.
This patch implements their stub functions on x86 and arm64, which work
as workaround.
[akpm@linux-foundation.org: fix typo in pmd_free_pte_page() stub]
Link: http://lkml.kernel.org/r/20180314180155.19492-2-toshi.kani@hpe.com
Fixes: e61ce6ade404e ("mm: change ioremap to set up huge I/O mappings")
Reported-by: Lei Li <lious.lilei@hisilicon.com>
Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Wang Xuefeng <wxf.wang@hisilicon.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-03-23 07:17:20 +08:00
|
|
|
{
|
2018-06-06 15:01:21 +08:00
|
|
|
pte_t *table;
|
|
|
|
pmd_t pmd;
|
|
|
|
|
|
|
|
pmd = READ_ONCE(*pmdp);
|
|
|
|
|
2018-09-06 00:38:57 +08:00
|
|
|
if (!pmd_table(pmd)) {
|
2018-12-28 16:37:42 +08:00
|
|
|
VM_WARN_ON(1);
|
2018-06-06 15:01:21 +08:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
table = pte_offset_kernel(pmdp, addr);
|
|
|
|
pmd_clear(pmdp);
|
|
|
|
__flush_tlb_kernel_pgtable(addr);
|
|
|
|
pte_free_kernel(NULL, table);
|
|
|
|
return 1;
|
mm/vmalloc: add interfaces to free unmapped page table
On architectures with CONFIG_HAVE_ARCH_HUGE_VMAP set, ioremap() may
create pud/pmd mappings. A kernel panic was observed on arm64 systems
with Cortex-A75 in the following steps as described by Hanjun Guo.
1. ioremap a 4K size, valid page table will build,
2. iounmap it, pte0 will set to 0;
3. ioremap the same address with 2M size, pgd/pmd is unchanged,
then set the a new value for pmd;
4. pte0 is leaked;
5. CPU may meet exception because the old pmd is still in TLB,
which will lead to kernel panic.
This panic is not reproducible on x86. INVLPG, called from iounmap,
purges all levels of entries associated with purged address on x86. x86
still has memory leak.
The patch changes the ioremap path to free unmapped page table(s) since
doing so in the unmap path has the following issues:
- The iounmap() path is shared with vunmap(). Since vmap() only
supports pte mappings, making vunmap() to free a pte page is an
overhead for regular vmap users as they do not need a pte page freed
up.
- Checking if all entries in a pte page are cleared in the unmap path
is racy, and serializing this check is expensive.
- The unmap path calls free_vmap_area_noflush() to do lazy TLB purges.
Clearing a pud/pmd entry before the lazy TLB purges needs extra TLB
purge.
Add two interfaces, pud_free_pmd_page() and pmd_free_pte_page(), which
clear a given pud/pmd entry and free up a page for the lower level
entries.
This patch implements their stub functions on x86 and arm64, which work
as workaround.
[akpm@linux-foundation.org: fix typo in pmd_free_pte_page() stub]
Link: http://lkml.kernel.org/r/20180314180155.19492-2-toshi.kani@hpe.com
Fixes: e61ce6ade404e ("mm: change ioremap to set up huge I/O mappings")
Reported-by: Lei Li <lious.lilei@hisilicon.com>
Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Wang Xuefeng <wxf.wang@hisilicon.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-03-23 07:17:20 +08:00
|
|
|
}
|
|
|
|
|
2018-06-06 15:01:21 +08:00
|
|
|
int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
|
mm/vmalloc: add interfaces to free unmapped page table
On architectures with CONFIG_HAVE_ARCH_HUGE_VMAP set, ioremap() may
create pud/pmd mappings. A kernel panic was observed on arm64 systems
with Cortex-A75 in the following steps as described by Hanjun Guo.
1. ioremap a 4K size, valid page table will build,
2. iounmap it, pte0 will set to 0;
3. ioremap the same address with 2M size, pgd/pmd is unchanged,
then set the a new value for pmd;
4. pte0 is leaked;
5. CPU may meet exception because the old pmd is still in TLB,
which will lead to kernel panic.
This panic is not reproducible on x86. INVLPG, called from iounmap,
purges all levels of entries associated with purged address on x86. x86
still has memory leak.
The patch changes the ioremap path to free unmapped page table(s) since
doing so in the unmap path has the following issues:
- The iounmap() path is shared with vunmap(). Since vmap() only
supports pte mappings, making vunmap() to free a pte page is an
overhead for regular vmap users as they do not need a pte page freed
up.
- Checking if all entries in a pte page are cleared in the unmap path
is racy, and serializing this check is expensive.
- The unmap path calls free_vmap_area_noflush() to do lazy TLB purges.
Clearing a pud/pmd entry before the lazy TLB purges needs extra TLB
purge.
Add two interfaces, pud_free_pmd_page() and pmd_free_pte_page(), which
clear a given pud/pmd entry and free up a page for the lower level
entries.
This patch implements their stub functions on x86 and arm64, which work
as workaround.
[akpm@linux-foundation.org: fix typo in pmd_free_pte_page() stub]
Link: http://lkml.kernel.org/r/20180314180155.19492-2-toshi.kani@hpe.com
Fixes: e61ce6ade404e ("mm: change ioremap to set up huge I/O mappings")
Reported-by: Lei Li <lious.lilei@hisilicon.com>
Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Wang Xuefeng <wxf.wang@hisilicon.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-03-23 07:17:20 +08:00
|
|
|
{
|
2018-06-06 15:01:21 +08:00
|
|
|
pmd_t *table;
|
|
|
|
pmd_t *pmdp;
|
|
|
|
pud_t pud;
|
|
|
|
unsigned long next, end;
|
|
|
|
|
|
|
|
pud = READ_ONCE(*pudp);
|
|
|
|
|
2018-09-06 00:38:57 +08:00
|
|
|
if (!pud_table(pud)) {
|
2018-12-28 16:37:42 +08:00
|
|
|
VM_WARN_ON(1);
|
2018-06-06 15:01:21 +08:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
table = pmd_offset(pudp, addr);
|
|
|
|
pmdp = table;
|
|
|
|
next = addr;
|
|
|
|
end = addr + PUD_SIZE;
|
|
|
|
do {
|
|
|
|
pmd_free_pte_page(pmdp, next);
|
|
|
|
} while (pmdp++, next += PMD_SIZE, next != end);
|
|
|
|
|
|
|
|
pud_clear(pudp);
|
|
|
|
__flush_tlb_kernel_pgtable(addr);
|
|
|
|
pmd_free(NULL, table);
|
|
|
|
return 1;
|
mm/vmalloc: add interfaces to free unmapped page table
On architectures with CONFIG_HAVE_ARCH_HUGE_VMAP set, ioremap() may
create pud/pmd mappings. A kernel panic was observed on arm64 systems
with Cortex-A75 in the following steps as described by Hanjun Guo.
1. ioremap a 4K size, valid page table will build,
2. iounmap it, pte0 will set to 0;
3. ioremap the same address with 2M size, pgd/pmd is unchanged,
then set the a new value for pmd;
4. pte0 is leaked;
5. CPU may meet exception because the old pmd is still in TLB,
which will lead to kernel panic.
This panic is not reproducible on x86. INVLPG, called from iounmap,
purges all levels of entries associated with purged address on x86. x86
still has memory leak.
The patch changes the ioremap path to free unmapped page table(s) since
doing so in the unmap path has the following issues:
- The iounmap() path is shared with vunmap(). Since vmap() only
supports pte mappings, making vunmap() to free a pte page is an
overhead for regular vmap users as they do not need a pte page freed
up.
- Checking if all entries in a pte page are cleared in the unmap path
is racy, and serializing this check is expensive.
- The unmap path calls free_vmap_area_noflush() to do lazy TLB purges.
Clearing a pud/pmd entry before the lazy TLB purges needs extra TLB
purge.
Add two interfaces, pud_free_pmd_page() and pmd_free_pte_page(), which
clear a given pud/pmd entry and free up a page for the lower level
entries.
This patch implements their stub functions on x86 and arm64, which work
as workaround.
[akpm@linux-foundation.org: fix typo in pmd_free_pte_page() stub]
Link: http://lkml.kernel.org/r/20180314180155.19492-2-toshi.kani@hpe.com
Fixes: e61ce6ade404e ("mm: change ioremap to set up huge I/O mappings")
Reported-by: Lei Li <lious.lilei@hisilicon.com>
Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Wang Xuefeng <wxf.wang@hisilicon.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Chintan Pandya <cpandya@codeaurora.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-03-23 07:17:20 +08:00
|
|
|
}
|
2018-12-12 02:48:48 +08:00
|
|
|
|
2018-12-28 16:37:53 +08:00
|
|
|
int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
|
|
|
|
{
|
|
|
|
return 0; /* Don't attempt a block mapping */
|
|
|
|
}
|
|
|
|
|
2018-12-12 02:48:48 +08:00
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
2020-03-04 12:28:43 +08:00
|
|
|
static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
|
|
|
|
{
|
|
|
|
unsigned long end = start + size;
|
|
|
|
|
|
|
|
WARN_ON(pgdir != init_mm.pgd);
|
|
|
|
WARN_ON((start < PAGE_OFFSET) || (end > PAGE_END));
|
|
|
|
|
|
|
|
unmap_hotplug_range(start, end, false);
|
|
|
|
free_empty_tables(start, end, PAGE_OFFSET, PAGE_END);
|
|
|
|
}
|
|
|
|
|
2019-05-14 08:21:26 +08:00
|
|
|
int arch_add_memory(int nid, u64 start, u64 size,
|
2020-04-11 05:33:21 +08:00
|
|
|
struct mhp_params *params)
|
2018-12-12 02:48:48 +08:00
|
|
|
{
|
2020-03-04 12:28:43 +08:00
|
|
|
int ret, flags = 0;
|
2018-12-12 02:48:48 +08:00
|
|
|
|
|
|
|
if (rodata_full || debug_pagealloc_enabled())
|
|
|
|
flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
|
|
|
|
|
|
|
|
__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
|
mm/memory_hotplug: add pgprot_t to mhp_params
devm_memremap_pages() is currently used by the PCI P2PDMA code to create
struct page mappings for IO memory. At present, these mappings are
created with PAGE_KERNEL which implies setting the PAT bits to be WB.
However, on x86, an mtrr register will typically override this and force
the cache type to be UC-. In the case firmware doesn't set this
register it is effectively WB and will typically result in a machine
check exception when it's accessed.
Other arches are not currently likely to function correctly seeing they
don't have any MTRR registers to fall back on.
To solve this, provide a way to specify the pgprot value explicitly to
arch_add_memory().
Of the arches that support MEMORY_HOTPLUG: x86_64, and arm64 need a
simple change to pass the pgprot_t down to their respective functions
which set up the page tables. For x86_32, set the page tables
explicitly using _set_memory_prot() (seeing they are already mapped).
For ia64, s390 and sh, reject anything but PAGE_KERNEL settings -- this
should be fine, for now, seeing these architectures don't support
ZONE_DEVICE.
A check in __add_pages() is also added to ensure the pgprot parameter
was set for all arches.
Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Badger <ebadger@gigaio.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Link: http://lkml.kernel.org/r/20200306170846.9333-7-logang@deltatee.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-04-11 05:33:36 +08:00
|
|
|
size, params->pgprot, __pgd_pgtable_alloc,
|
|
|
|
flags);
|
2018-12-12 02:48:48 +08:00
|
|
|
|
2019-11-07 09:43:21 +08:00
|
|
|
memblock_clear_nomap(start, size);
|
|
|
|
|
2020-03-04 12:28:43 +08:00
|
|
|
ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
|
2020-04-11 05:33:21 +08:00
|
|
|
params);
|
2020-03-04 12:28:43 +08:00
|
|
|
if (ret)
|
|
|
|
__remove_pgd_mapping(swapper_pg_dir,
|
|
|
|
__phys_to_virt(start), size);
|
|
|
|
return ret;
|
2018-12-12 02:48:48 +08:00
|
|
|
}
|
2020-03-04 12:28:43 +08:00
|
|
|
|
2019-07-19 06:56:41 +08:00
|
|
|
void arch_remove_memory(int nid, u64 start, u64 size,
|
|
|
|
struct vmem_altmap *altmap)
|
|
|
|
{
|
|
|
|
unsigned long start_pfn = start >> PAGE_SHIFT;
|
|
|
|
unsigned long nr_pages = size >> PAGE_SHIFT;
|
|
|
|
|
2020-01-05 04:59:33 +08:00
|
|
|
__remove_pages(start_pfn, nr_pages, altmap);
|
2020-03-04 12:28:43 +08:00
|
|
|
__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This memory hotplug notifier helps prevent boot memory from being
|
|
|
|
* inadvertently removed as it blocks pfn range offlining process in
|
|
|
|
* __offline_pages(). Hence this prevents both offlining as well as
|
|
|
|
* removal process for boot memory which is initially always online.
|
|
|
|
* In future if and when boot memory could be removed, this notifier
|
|
|
|
* should be dropped and free_hotplug_page_range() should handle any
|
|
|
|
* reserved pages allocated during boot.
|
|
|
|
*/
|
|
|
|
static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
|
|
|
|
unsigned long action, void *data)
|
|
|
|
{
|
|
|
|
struct mem_section *ms;
|
|
|
|
struct memory_notify *arg = data;
|
|
|
|
unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
|
|
|
|
unsigned long pfn = arg->start_pfn;
|
|
|
|
|
|
|
|
if (action != MEM_GOING_OFFLINE)
|
|
|
|
return NOTIFY_OK;
|
|
|
|
|
|
|
|
for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
|
|
|
|
ms = __pfn_to_section(pfn);
|
|
|
|
if (early_section(ms))
|
|
|
|
return NOTIFY_BAD;
|
|
|
|
}
|
|
|
|
return NOTIFY_OK;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct notifier_block prevent_bootmem_remove_nb = {
|
|
|
|
.notifier_call = prevent_bootmem_remove_notifier,
|
|
|
|
};
|
|
|
|
|
|
|
|
static int __init prevent_bootmem_remove_init(void)
|
|
|
|
{
|
|
|
|
return register_memory_notifier(&prevent_bootmem_remove_nb);
|
2019-07-19 06:56:41 +08:00
|
|
|
}
|
2020-03-04 12:28:43 +08:00
|
|
|
device_initcall(prevent_bootmem_remove_init);
|
2019-07-19 06:56:41 +08:00
|
|
|
#endif
|