linux/arch/arm64/include/asm/mmu_context.h
Pingfan Liu c4885bbb3a arm64/mm: save memory access in check_and_switch_context() fast switch path
On arm64, smp_processor_id() reads a per-cpu `cpu_number` variable,
using the per-cpu offset stored in the tpidr_el1 system register. In
some cases we generate a per-cpu address with a sequence like:

  cpu_ptr = &per_cpu(ptr, smp_processor_id());

Which potentially incurs a cache miss for both `cpu_number` and the
in-memory `__per_cpu_offset` array. This can be written more optimally
as:

  cpu_ptr = this_cpu_ptr(ptr);

Which only needs the offset from tpidr_el1, and does not need to
load from memory.

The following two test cases show a small performance improvement measured
on a 46-cpus qualcomm machine with 5.8.0-rc4 kernel.

Test 1: (about 0.3% improvement)
    #cat b.sh
    make clean && make all -j138
    #perf stat --repeat 10 --null --sync sh b.sh

    - before this patch
     Performance counter stats for 'sh b.sh' (10 runs):

                298.62 +- 1.86 seconds time elapsed  ( +-  0.62% )

    - after this patch
     Performance counter stats for 'sh b.sh' (10 runs):

               297.734 +- 0.954 seconds time elapsed  ( +-  0.32% )

Test 2: (about 1.69% improvement)
     'perf stat -r 10 perf bench sched messaging'
        Then sum the total time of 'sched/messaging' by manual.

    - before this patch
      total 0.707 sec for 10 times
    - after this patch
      totol 0.695 sec for 10 times

Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Steve Capper <steve.capper@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Vladimir Murzin <vladimir.murzin@arm.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Link: https://lore.kernel.org/r/1594389852-19949-1-git-send-email-kernelfans@gmail.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
2020-07-30 12:58:40 +01:00

254 lines
6.6 KiB
C

/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Based on arch/arm/include/asm/mmu_context.h
*
* Copyright (C) 1996 Russell King.
* Copyright (C) 2012 ARM Ltd.
*/
#ifndef __ASM_MMU_CONTEXT_H
#define __ASM_MMU_CONTEXT_H
#ifndef __ASSEMBLY__
#include <linux/compiler.h>
#include <linux/sched.h>
#include <linux/sched/hotplug.h>
#include <linux/mm_types.h>
#include <linux/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/cpufeature.h>
#include <asm/proc-fns.h>
#include <asm-generic/mm_hooks.h>
#include <asm/cputype.h>
#include <asm/sysreg.h>
#include <asm/tlbflush.h>
extern bool rodata_full;
static inline void contextidr_thread_switch(struct task_struct *next)
{
if (!IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR))
return;
write_sysreg(task_pid_nr(next), contextidr_el1);
isb();
}
/*
* Set TTBR0 to empty_zero_page. No translations will be possible via TTBR0.
*/
static inline void cpu_set_reserved_ttbr0(void)
{
unsigned long ttbr = phys_to_ttbr(__pa_symbol(empty_zero_page));
write_sysreg(ttbr, ttbr0_el1);
isb();
}
void cpu_do_switch_mm(phys_addr_t pgd_phys, struct mm_struct *mm);
static inline void cpu_switch_mm(pgd_t *pgd, struct mm_struct *mm)
{
BUG_ON(pgd == swapper_pg_dir);
cpu_set_reserved_ttbr0();
cpu_do_switch_mm(virt_to_phys(pgd),mm);
}
/*
* TCR.T0SZ value to use when the ID map is active. Usually equals
* TCR_T0SZ(VA_BITS), unless system RAM is positioned very high in
* physical memory, in which case it will be smaller.
*/
extern u64 idmap_t0sz;
extern u64 idmap_ptrs_per_pgd;
static inline bool __cpu_uses_extended_idmap(void)
{
if (IS_ENABLED(CONFIG_ARM64_VA_BITS_52))
return false;
return unlikely(idmap_t0sz != TCR_T0SZ(VA_BITS));
}
/*
* True if the extended ID map requires an extra level of translation table
* to be configured.
*/
static inline bool __cpu_uses_extended_idmap_level(void)
{
return ARM64_HW_PGTABLE_LEVELS(64 - idmap_t0sz) > CONFIG_PGTABLE_LEVELS;
}
/*
* Set TCR.T0SZ to its default value (based on VA_BITS)
*/
static inline void __cpu_set_tcr_t0sz(unsigned long t0sz)
{
unsigned long tcr;
if (!__cpu_uses_extended_idmap())
return;
tcr = read_sysreg(tcr_el1);
tcr &= ~TCR_T0SZ_MASK;
tcr |= t0sz << TCR_T0SZ_OFFSET;
write_sysreg(tcr, tcr_el1);
isb();
}
#define cpu_set_default_tcr_t0sz() __cpu_set_tcr_t0sz(TCR_T0SZ(vabits_actual))
#define cpu_set_idmap_tcr_t0sz() __cpu_set_tcr_t0sz(idmap_t0sz)
/*
* Remove the idmap from TTBR0_EL1 and install the pgd of the active mm.
*
* The idmap lives in the same VA range as userspace, but uses global entries
* and may use a different TCR_EL1.T0SZ. To avoid issues resulting from
* speculative TLB fetches, we must temporarily install the reserved page
* tables while we invalidate the TLBs and set up the correct TCR_EL1.T0SZ.
*
* If current is a not a user task, the mm covers the TTBR1_EL1 page tables,
* which should not be installed in TTBR0_EL1. In this case we can leave the
* reserved page tables in place.
*/
static inline void cpu_uninstall_idmap(void)
{
struct mm_struct *mm = current->active_mm;
cpu_set_reserved_ttbr0();
local_flush_tlb_all();
cpu_set_default_tcr_t0sz();
if (mm != &init_mm && !system_uses_ttbr0_pan())
cpu_switch_mm(mm->pgd, mm);
}
static inline void cpu_install_idmap(void)
{
cpu_set_reserved_ttbr0();
local_flush_tlb_all();
cpu_set_idmap_tcr_t0sz();
cpu_switch_mm(lm_alias(idmap_pg_dir), &init_mm);
}
/*
* Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD,
* avoiding the possibility of conflicting TLB entries being allocated.
*/
static inline void cpu_replace_ttbr1(pgd_t *pgdp)
{
typedef void (ttbr_replace_func)(phys_addr_t);
extern ttbr_replace_func idmap_cpu_replace_ttbr1;
ttbr_replace_func *replace_phys;
/* phys_to_ttbr() zeros lower 2 bits of ttbr with 52-bit PA */
phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp));
if (system_supports_cnp() && !WARN_ON(pgdp != lm_alias(swapper_pg_dir))) {
/*
* cpu_replace_ttbr1() is used when there's a boot CPU
* up (i.e. cpufeature framework is not up yet) and
* latter only when we enable CNP via cpufeature's
* enable() callback.
* Also we rely on the cpu_hwcap bit being set before
* calling the enable() function.
*/
ttbr1 |= TTBR_CNP_BIT;
}
replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1);
cpu_install_idmap();
replace_phys(ttbr1);
cpu_uninstall_idmap();
}
/*
* It would be nice to return ASIDs back to the allocator, but unfortunately
* that introduces a race with a generation rollover where we could erroneously
* free an ASID allocated in a future generation. We could workaround this by
* freeing the ASID from the context of the dying mm (e.g. in arch_exit_mmap),
* but we'd then need to make sure that we didn't dirty any TLBs afterwards.
* Setting a reserved TTBR0 or EPD0 would work, but it all gets ugly when you
* take CPU migration into account.
*/
#define destroy_context(mm) do { } while(0)
void check_and_switch_context(struct mm_struct *mm);
#define init_new_context(tsk,mm) ({ atomic64_set(&(mm)->context.id, 0); 0; })
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
static inline void update_saved_ttbr0(struct task_struct *tsk,
struct mm_struct *mm)
{
u64 ttbr;
if (!system_uses_ttbr0_pan())
return;
if (mm == &init_mm)
ttbr = __pa_symbol(empty_zero_page);
else
ttbr = virt_to_phys(mm->pgd) | ASID(mm) << 48;
WRITE_ONCE(task_thread_info(tsk)->ttbr0, ttbr);
}
#else
static inline void update_saved_ttbr0(struct task_struct *tsk,
struct mm_struct *mm)
{
}
#endif
static inline void
enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
/*
* We don't actually care about the ttbr0 mapping, so point it at the
* zero page.
*/
update_saved_ttbr0(tsk, &init_mm);
}
static inline void __switch_mm(struct mm_struct *next)
{
/*
* init_mm.pgd does not contain any user mappings and it is always
* active for kernel addresses in TTBR1. Just set the reserved TTBR0.
*/
if (next == &init_mm) {
cpu_set_reserved_ttbr0();
return;
}
check_and_switch_context(next);
}
static inline void
switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
if (prev != next)
__switch_mm(next);
/*
* Update the saved TTBR0_EL1 of the scheduled-in task as the previous
* value may have not been initialised yet (activate_mm caller) or the
* ASID has changed since the last run (following the context switch
* of another thread of the same process).
*/
update_saved_ttbr0(tsk, next);
}
#define deactivate_mm(tsk,mm) do { } while (0)
#define activate_mm(prev,next) switch_mm(prev, next, current)
void verify_cpu_asid_bits(void);
void post_ttbr_update_workaround(void);
#endif /* !__ASSEMBLY__ */
#endif /* !__ASM_MMU_CONTEXT_H */