Merge patch series "Svvptc extension to remove preventive sfence.vma"
Alexandre Ghiti <alexghiti@rivosinc.com> says:

In RISC-V, after a new mapping is established, a sfence.vma needs to be
emitted for different reasons:

- if the uarch caches invalid entries, we need to invalidate it, otherwise
  we would trap on this invalid entry,
- if the uarch does not cache invalid entries, a reordered access could fail
  to see the new mapping and then trap (sfence.vma acts as a fence).

We can actually avoid emitting those (mostly) useless and costly sfence.vma
by handling the traps instead:

- for new kernel mappings: only vmalloc mappings need to be taken care of,
  other new mappings are rare and already emit the required sfence.vma if
  needed. That must be achieved very early in the exception path as explained
  in patch 3, and this also fixes our fragile way of dealing with vmalloc
  faults.
- for new user mappings: Svvptc makes update_mmu_cache() a no-op, but we can
  take some gratuitous page faults (which are very unlikely though).

Patches 1 and 2 introduce Svvptc extension probing.

On our uarch that does not cache invalid entries and a 6.5 kernel, the gains
are measurable:

* Kernel boot:             6%
* ltp - mmapstress01:      8%
* lmbench - lat_pagefault: 20%
* lmbench - lat_mmap:      5%

Here are the corresponding numbers of sfence.vma emitted:

* Ubuntu boot to login:
  Before: ~630k sfence.vma
  After:  ~200k sfence.vma
* ltp - mmapstress01:
  Before: ~45k
  After:  ~6.3k
* lmbench - lat_pagefault:
  Before: ~665k
  After:  832 (!)
* lmbench - lat_mmap:
  Before: ~546k
  After:  718 (!)

Thanks to Ved and Matt Evans for triggering the discussion that led to this
patchset!

* b4-shazam-merge:
  riscv: Stop emitting preventive sfence.vma for new userspace mappings with Svvptc
  riscv: Stop emitting preventive sfence.vma for new vmalloc mappings
  dt-bindings: riscv: Add Svvptc ISA extension description
  riscv: Add ISA extension parsing for Svvptc

Link: https://lore.kernel.org/r/20240717060125.139416-1-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
commit 7e340f4fad
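Before the diff itself, here is a minimal user-space sketch (in C, not the kernel code) of the bookkeeping the cover letter describes: every new vmalloc mapping marks all CPUs as "stale" in a per-CPU bitmap, and the first kernel-address fault on a CPU consumes its bit and simply retries the access, instead of every mapping paying for a preventive sfence.vma. The array name new_vmalloc comes from the patches; mark_new_vmalloc_mapping(), fault_on_kernel_address() and NCPUS are illustrative stand-ins for flush_cache_vmap() and the new_vmalloc_check macro in handle_exception().

/* Sketch only: single-threaded model of the per-CPU "new vmalloc" bitmap. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NCPUS 128
#define BITS_PER_WORD 64

/* One bit per CPU: "a vmalloc mapping appeared since your last fence". */
static uint64_t new_vmalloc[(NCPUS + BITS_PER_WORD - 1) / BITS_PER_WORD];

/* Called when a new vmalloc/kernel mapping is installed (kernel: flush_cache_vmap()). */
static void mark_new_vmalloc_mapping(void)
{
        for (size_t i = 0; i < sizeof(new_vmalloc) / sizeof(new_vmalloc[0]); i++)
                new_vmalloc[i] = ~0ULL;         /* every CPU may need a local fence */
}

/* Called from the trap path when a kernel address faulted on @cpu. */
static bool fault_on_kernel_address(unsigned int cpu)
{
        uint64_t *word = &new_vmalloc[cpu / BITS_PER_WORD];
        uint64_t mask = 1ULL << (cpu % BITS_PER_WORD);

        if (*word & mask) {
                *word &= ~mask;                 /* kernel: atomic amoxor.d */
                /* kernel: sfence.vma only if the uarch caches invalid entries */
                return true;                    /* retry the faulting access */
        }
        return false;                           /* genuine fault, handle normally */
}

int main(void)
{
        mark_new_vmalloc_mapping();
        printf("cpu 3, first fault retried:  %d\n", fault_on_kernel_address(3));
        printf("cpu 3, second fault retried: %d\n", fault_on_kernel_address(3));
        return 0;
}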
@@ -171,6 +171,13 @@ properties:
             memory types as ratified in the 20191213 version of the privileged
             ISA specification.
 
+        - const: svvptc
+          description:
+            The standard Svvptc supervisor-level extension for
+            address-translation cache behaviour with respect to invalid entries
+            as ratified at commit 4a69197e5617 ("Update to ratified state") of
+            riscv-svvptc.
+
         - const: zacas
           description: |
             The Zacas extension for Atomic Compare-and-Swap (CAS) instructions
@@ -46,7 +46,23 @@ do { \
 } while (0)
 
 #ifdef CONFIG_64BIT
-#define flush_cache_vmap(start, end)            flush_tlb_kernel_range(start, end)
+extern u64 new_vmalloc[NR_CPUS / sizeof(u64) + 1];
+extern char _end[];
+#define flush_cache_vmap flush_cache_vmap
+static inline void flush_cache_vmap(unsigned long start, unsigned long end)
+{
+        if (is_vmalloc_or_module_addr((void *)start)) {
+                int i;
+
+                /*
+                 * We don't care if concurrently a cpu resets this value since
+                 * the only place this can happen is in handle_exception() where
+                 * an sfence.vma is emitted.
+                 */
+                for (i = 0; i < ARRAY_SIZE(new_vmalloc); ++i)
+                        new_vmalloc[i] = -1ULL;
+        }
+}
 #define flush_cache_vmap_early(start, end)      local_flush_tlb_kernel_range(start, end)
 #endif
 
@@ -92,6 +92,7 @@
 #define RISCV_ISA_EXT_ZCF               83
 #define RISCV_ISA_EXT_ZCMOP             84
 #define RISCV_ISA_EXT_ZAWRS             85
+#define RISCV_ISA_EXT_SVVPTC            86
 
 #define RISCV_ISA_EXT_XLINUXENVCFG      127
 
@@ -497,6 +497,9 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf,
                 struct vm_area_struct *vma, unsigned long address,
                 pte_t *ptep, unsigned int nr)
 {
+        asm goto(ALTERNATIVE("nop", "j %l[svvptc]", 0, RISCV_ISA_EXT_SVVPTC, 1)
+                 : : : : svvptc);
+
         /*
          * The kernel assumes that TLBs don't cache invalid entries, but
          * in RISC-V, SFENCE.VMA specifies an ordering constraint, not a
@@ -506,6 +509,13 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf,
          */
         while (nr--)
                 local_flush_tlb_page(address + nr * PAGE_SIZE);
+
+svvptc:;
+        /*
+         * Svvptc guarantees that the new valid pte will be visible within
+         * a bounded timeframe, so when the uarch does not cache invalid
+         * entries, we don't have to do anything.
+         */
 }
 #define update_mmu_cache(vma, addr, ptep) \
         update_mmu_cache_range(NULL, vma, addr, ptep, 1)
@@ -61,6 +61,13 @@ struct thread_info {
         void *scs_base;
         void *scs_sp;
 #endif
+#ifdef CONFIG_64BIT
+        /*
+         * Used in handle_exception() to save a0, a1 and a2 before knowing if we
+         * can access the kernel stack.
+         */
+        unsigned long a0, a1, a2;
+#endif
 };
 
 #ifdef CONFIG_SHADOW_CALL_STACK
@@ -36,6 +36,8 @@ void asm_offsets(void)
         OFFSET(TASK_THREAD_S9, task_struct, thread.s[9]);
         OFFSET(TASK_THREAD_S10, task_struct, thread.s[10]);
         OFFSET(TASK_THREAD_S11, task_struct, thread.s[11]);
+
+        OFFSET(TASK_TI_CPU, task_struct, thread_info.cpu);
         OFFSET(TASK_TI_FLAGS, task_struct, thread_info.flags);
         OFFSET(TASK_TI_PREEMPT_COUNT, task_struct, thread_info.preempt_count);
         OFFSET(TASK_TI_KERNEL_SP, task_struct, thread_info.kernel_sp);
@@ -43,6 +45,11 @@ void asm_offsets(void)
 #ifdef CONFIG_SHADOW_CALL_STACK
         OFFSET(TASK_TI_SCS_SP, task_struct, thread_info.scs_sp);
 #endif
+#ifdef CONFIG_64BIT
+        OFFSET(TASK_TI_A0, task_struct, thread_info.a0);
+        OFFSET(TASK_TI_A1, task_struct, thread_info.a1);
+        OFFSET(TASK_TI_A2, task_struct, thread_info.a2);
+#endif
 
         OFFSET(TASK_TI_CPU_NUM, task_struct, thread_info.cpu);
         OFFSET(TASK_THREAD_F0, task_struct, thread.fstate.f[0]);
@@ -381,6 +381,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
         __RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL),
         __RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT),
         __RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT),
+        __RISCV_ISA_EXT_DATA(svvptc, RISCV_ISA_EXT_SVVPTC),
 };
 
 const size_t riscv_isa_ext_count = ARRAY_SIZE(riscv_isa_ext);
@@ -19,6 +19,79 @@
 
         .section .irqentry.text, "ax"
 
+.macro new_vmalloc_check
+        REG_S   a0, TASK_TI_A0(tp)
+        csrr    a0, CSR_CAUSE
+        /* Exclude IRQs */
+        blt     a0, zero, _new_vmalloc_restore_context_a0
+
+        REG_S   a1, TASK_TI_A1(tp)
+        /* Only check new_vmalloc if we are in page/protection fault */
+        li      a1, EXC_LOAD_PAGE_FAULT
+        beq     a0, a1, _new_vmalloc_kernel_address
+        li      a1, EXC_STORE_PAGE_FAULT
+        beq     a0, a1, _new_vmalloc_kernel_address
+        li      a1, EXC_INST_PAGE_FAULT
+        bne     a0, a1, _new_vmalloc_restore_context_a1
+
+_new_vmalloc_kernel_address:
+        /* Is it a kernel address? */
+        csrr    a0, CSR_TVAL
+        bge     a0, zero, _new_vmalloc_restore_context_a1
+
+        /* Check if a new vmalloc mapping appeared that could explain the trap */
+        REG_S   a2, TASK_TI_A2(tp)
+        /*
+         * Computes:
+         * a0 = &new_vmalloc[BIT_WORD(cpu)]
+         * a1 = BIT_MASK(cpu)
+         */
+        REG_L   a2, TASK_TI_CPU(tp)
+        /*
+         * Compute the new_vmalloc element position:
+         * (cpu / 64) * 8 = (cpu >> 6) << 3
+         */
+        srli    a1, a2, 6
+        slli    a1, a1, 3
+        la      a0, new_vmalloc
+        add     a0, a0, a1
+        /*
+         * Compute the bit position in the new_vmalloc element:
+         * bit_pos = cpu % 64 = cpu - (cpu / 64) * 64 = cpu - (cpu >> 6) << 6
+         *         = cpu - ((cpu >> 6) << 3) << 3
+         */
+        slli    a1, a1, 3
+        sub     a1, a2, a1
+        /* Compute the "get mask": 1 << bit_pos */
+        li      a2, 1
+        sll     a1, a2, a1
+
+        /* Check the value of new_vmalloc for this cpu */
+        REG_L   a2, 0(a0)
+        and     a2, a2, a1
+        beq     a2, zero, _new_vmalloc_restore_context
+
+        /* Atomically reset the current cpu bit in new_vmalloc */
+        amoxor.d        a0, a1, (a0)
+
+        /* Only emit a sfence.vma if the uarch caches invalid entries */
+        ALTERNATIVE("sfence.vma", "nop", 0, RISCV_ISA_EXT_SVVPTC, 1)
+
+        REG_L   a0, TASK_TI_A0(tp)
+        REG_L   a1, TASK_TI_A1(tp)
+        REG_L   a2, TASK_TI_A2(tp)
+        csrw    CSR_SCRATCH, x0
+        sret
+
+_new_vmalloc_restore_context:
+        REG_L   a2, TASK_TI_A2(tp)
+_new_vmalloc_restore_context_a1:
+        REG_L   a1, TASK_TI_A1(tp)
+_new_vmalloc_restore_context_a0:
+        REG_L   a0, TASK_TI_A0(tp)
+.endm
+
+
 SYM_CODE_START(handle_exception)
         /*
          * If coming from userspace, preserve the user thread pointer and load
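The shift arithmetic commented in new_vmalloc_check above is terse, so here is the same computation rendered in plain C as a sanity check. This is a sketch only, with ordinary variables standing in for the a0/a1/a2 registers; it confirms that the two shift sequences reduce to cpu / 64 (times eight bytes per u64 word) and cpu % 64, i.e. &new_vmalloc[BIT_WORD(cpu)] and BIT_MASK(cpu).

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        for (unsigned long cpu = 0; cpu < 256; cpu++) {
                unsigned long byte_off = (cpu >> 6) << 3;       /* srli a1, a2, 6 ; slli a1, a1, 3 */
                unsigned long bit_pos  = cpu - (byte_off << 3); /* slli a1, a1, 3 ; sub a1, a2, a1 */
                uint64_t mask = 1ULL << bit_pos;                /* li a2, 1 ; sll a1, a2, a1 */

                assert(byte_off == (cpu / 64) * sizeof(uint64_t));
                assert(bit_pos == cpu % 64);
                assert(mask == 1ULL << (cpu % 64));
        }
        printf("word offset / bit mask arithmetic checks out\n");
        return 0;
}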
@@ -30,6 +103,20 @@ SYM_CODE_START(handle_exception)
 
 .Lrestore_kernel_tpsp:
         csrr tp, CSR_SCRATCH
+
+#ifdef CONFIG_64BIT
+        /*
+         * The RISC-V kernel does not eagerly emit a sfence.vma after each
+         * new vmalloc mapping, which may result in exceptions:
+         * - if the uarch caches invalid entries, the new mapping would not be
+         *   observed by the page table walker and an invalidation is needed.
+         * - if the uarch does not cache invalid entries, a reordered access
+         *   could "miss" the new mapping and traps: in that case, we only need
+         *   to retry the access, no sfence.vma is required.
+         */
+        new_vmalloc_check
+#endif
+
         REG_S sp, TASK_TI_KERNEL_SP(tp)
 
 #ifdef CONFIG_VMAP_STACK
@@ -37,6 +37,8 @@
 
 #include "../kernel/head.h"
 
+u64 new_vmalloc[NR_CPUS / sizeof(u64) + 1];
+
 struct kernel_mapping kernel_map __ro_after_init;
 EXPORT_SYMBOL(kernel_map);
 #ifdef CONFIG_XIP_KERNEL
@@ -9,6 +9,9 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep,
                           pte_t entry, int dirty)
 {
+        asm goto(ALTERNATIVE("nop", "j %l[svvptc]", 0, RISCV_ISA_EXT_SVVPTC, 1)
+                 : : : : svvptc);
+
         if (!pte_same(ptep_get(ptep), entry))
                 __set_pte_at(vma->vm_mm, ptep, entry);
         /*
@@ -16,6 +19,16 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
          * the case that the PTE changed and the spurious fault case.
          */
         return true;
+
+svvptc:
+        if (!pte_same(ptep_get(ptep), entry)) {
+                __set_pte_at(vma->vm_mm, ptep, entry);
+                /* Here only not svadu is impacted */
+                flush_tlb_page(vma, address);
+                return true;
+        }
+
+        return false;
 }
 
 int ptep_test_and_clear_young(struct vm_area_struct *vma,