From bcc9d04e749a8cbdbe1b26285f0f69e315c70821 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:40 +0100 Subject: [PATCH 001/168] mm: Introduce ARCH_HAS_USER_SHADOW_STACK Since multiple architectures have support for shadow stacks and we need to select support for this feature in several places in the generic code provide a generic config option that the architectures can select. Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Reviewed-by: Deepak Gupta Reviewed-by: Rick Edgecombe Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Catalin Marinas Reviewed-by: Kees Cook Tested-by: Kees Cook Acked-by: Shuah Khan Reviewed-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-1-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/x86/Kconfig | 1 + fs/proc/task_mmu.c | 2 +- mm/Kconfig | 6 ++++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2852fcd82cbd..8ccae77d40f7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1954,6 +1954,7 @@ config X86_USER_SHADOW_STACK depends on AS_WRUSS depends on X86_64 select ARCH_USES_HIGH_VMA_FLAGS + select ARCH_HAS_USER_SHADOW_STACK select X86_CET help Shadow stack protection is a hardware feature that detects function diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 72f14fd59c2d..23f875e78eae 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -971,7 +971,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR [ilog2(VM_UFFD_MINOR)] = "ui", #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ -#ifdef CONFIG_X86_USER_SHADOW_STACK +#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK [ilog2(VM_SHADOW_STACK)] = "ss", #endif #if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) diff --git a/mm/Kconfig b/mm/Kconfig index 4c9f5ea13271..4b2a1ef9a161 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1296,6 +1296,12 @@ config NUMA_EMU into virtual nodes when booted with "numa=fake=N", where N is the number of nodes. This is only useful for debugging. +config ARCH_HAS_USER_SHADOW_STACK + bool + help + The architecture has hardware support for userspace shadow call + stacks (eg, x86 CET, arm64 GCS or RISC-V Zicfiss). + source "mm/damon/Kconfig" endmenu From 9ab515b18f8463fbb340fece47cd461809e42a9d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:41 +0100 Subject: [PATCH 002/168] mm: Define VM_HIGH_ARCH_6 The addition of protection keys means that on arm64 we now use all of the currently defined VM_HIGH_ARCH_x bits. In order to allow us to allocate a new flag for GCS pages define VM_HIGH_ARCH_6. 
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-2-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index ecf63d2b0582..182bad0c55df 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -329,12 +329,14 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) +#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS From f645e888b1a6760532d8d89714cb698dd52d89bd Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:42 +0100 Subject: [PATCH 003/168] arm64/mm: Restructure arch_validate_flags() for extensibility Currently arch_validate_flags() is written in a very non-extensible fashion, returning immediately if MTE is not supported and writing the MTE check as a direct return. Since we will want to add more checks for GCS refactor the existing code to be more extensible, no functional change intended. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-3-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mman.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h index 9e39217b4afb..03b790fd0ad8 100644 --- a/arch/arm64/include/asm/mman.h +++ b/arch/arm64/include/asm/mman.h @@ -62,11 +62,17 @@ static inline bool arch_validate_prot(unsigned long prot, static inline bool arch_validate_flags(unsigned long vm_flags) { - if (!system_supports_mte()) - return true; + if (system_supports_mte()) { + /* + * only allow VM_MTE if VM_MTE_ALLOWED has been set + * previously + */ + if ((vm_flags & VM_MTE) && !(vm_flags & VM_MTE_ALLOWED)) + return false; + } + + return true; - /* only allow VM_MTE if VM_MTE_ALLOWED has been set previously */ - return !(vm_flags & VM_MTE) || (vm_flags & VM_MTE_ALLOWED); } #define arch_validate_flags(vm_flags) arch_validate_flags(vm_flags) From 91e102e79740ae43ded050ccac71aa3371db4f33 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:43 +0100 Subject: [PATCH 004/168] prctl: arch-agnostic prctl for shadow stack Three architectures (x86, aarch64, riscv) have announced support for shadow stacks with fairly similar functionality. While x86 is using arch_prctl() to control the functionality neither arm64 nor riscv uses that interface so this patch adds arch-agnostic prctl() support to get and set status of shadow stacks and lock the current configuation to prevent further changes, with support for turning on and off individual subfeatures so applications can limit their exposure to features that they do not need. 
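As an illustration only (this is not part of the patch), the interface is expected to be used along the following lines, noting that in practice the enable happens very early since, as described later in the series, it is not generally possible to return from frames entered before the feature was switched on:

#include <sys/prctl.h>

static void setup_shadow_stack(void)
{
        unsigned long status = 0;

        /* Allocate a shadow stack and enable enforcement for this thread. */
        if (prctl(PR_SET_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE,
                  0, 0, 0))
                return;

        /* Read back the flags now in effect. */
        prctl(PR_GET_SHADOW_STACK_STATUS, &status, 0, 0, 0);

        /* Prevent any further change to the enable bit. */
        prctl(PR_LOCK_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE, 0, 0, 0);
}
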
The features are: - PR_SHADOW_STACK_ENABLE: Tracking and enforcement of shadow stacks, including allocation of a shadow stack if one is not already allocated. - PR_SHADOW_STACK_WRITE: Writes to specific addresses in the shadow stack. - PR_SHADOW_STACK_PUSH: Push additional values onto the shadow stack. These features are expected to be inherited by new threads and cleared on exec(), unknown features should be rejected for enable but accepted for locking (in order to allow for future proofing). This is based on a patch originally written by Deepak Gupta but modified fairly heavily, support for indirect landing pads is removed, additional modes added and the locking interface reworked. The set status prctl() is also reworked to just set flags, if setting/reading the shadow stack pointer is required this could be a separate prctl. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Acked-by: Yury Khrustalev Signed-off-by: Mark Brown Reviewed-by: Deepak Gupta Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-4-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- include/linux/mm.h | 4 ++++ include/uapi/linux/prctl.h | 22 ++++++++++++++++++++++ kernel/sys.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 182bad0c55df..56654306a832 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4221,4 +4221,8 @@ static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) } #endif /* CONFIG_MEM_ALLOC_PROFILING */ +int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status); +int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); +int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); + #endif /* _LINUX_MM_H */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 35791791a879..557a3d2ac1d4 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -328,4 +328,26 @@ struct prctl_mm_map { # define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC 0x10 /* Clear the aspect on exec */ # define PR_PPC_DEXCR_CTRL_MASK 0x1f +/* + * Get the current shadow stack configuration for the current thread, + * this will be the value configured via PR_SET_SHADOW_STACK_STATUS. + */ +#define PR_GET_SHADOW_STACK_STATUS 74 + +/* + * Set the current shadow stack configuration. Enabling the shadow + * stack will cause a shadow stack to be allocated for the thread. + */ +#define PR_SET_SHADOW_STACK_STATUS 75 +# define PR_SHADOW_STACK_ENABLE (1UL << 0) +# define PR_SHADOW_STACK_WRITE (1UL << 1) +# define PR_SHADOW_STACK_PUSH (1UL << 2) + +/* + * Prevent further changes to the specified shadow stack + * configuration. All bits may be locked via this call, including + * undefined bits. 
+ */ +#define PR_LOCK_SHADOW_STACK_STATUS 76 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 4da31f28fda8..3d38a9c7c5c9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2324,6 +2324,21 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, return -EINVAL; } +int __weak arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status) +{ + return -EINVAL; +} + +int __weak arch_set_shadow_stack_status(struct task_struct *t, unsigned long status) +{ + return -EINVAL; +} + +int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status) +{ + return -EINVAL; +} + #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) #ifdef CONFIG_ANON_VMA_NAME @@ -2784,6 +2799,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_RISCV_SET_ICACHE_FLUSH_CTX: error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3); break; + case PR_GET_SHADOW_STACK_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_get_shadow_stack_status(me, (unsigned long __user *) arg2); + break; + case PR_SET_SHADOW_STACK_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_set_shadow_stack_status(me, arg2); + break; + case PR_LOCK_SHADOW_STACK_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_lock_shadow_stack_status(me, arg2); + break; default: error = -EINVAL; break; From 3630e82ab6bd2642f0fc03b574783ccf2fb0c955 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:44 +0100 Subject: [PATCH 005/168] mman: Add map_shadow_stack() flags In preparation for adding arm64 GCS support make the map_shadow_stack() SHADOW_STACK_SET_TOKEN flag generic and add _SET_MARKER. The existing flag indicates that a token usable for stack switch should be added to the top of the newly mapped GCS region while the new flag indicates that a top of stack marker suitable for use by unwinders should be added above that. For arm64 the top of stack marker is all bits 0. 
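For illustration only (not part of this patch), a userspace sketch requesting both the token and the marker might look like this, assuming kernel headers that provide __NR_map_shadow_stack and the flag definitions:

#include <stddef.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SHADOW_STACK_SET_TOKEN
#define SHADOW_STACK_SET_TOKEN  (1ULL << 0)
#define SHADOW_STACK_SET_MARKER (1ULL << 1)
#endif

static void *map_gcs(size_t size)
{
        /*
         * Let the kernel choose the address; place an end of stack
         * marker at the top and a cap usable for stack switching
         * below it.
         */
        long ret = syscall(__NR_map_shadow_stack, 0, size,
                           SHADOW_STACK_SET_TOKEN | SHADOW_STACK_SET_MARKER);

        return ret == -1 ? NULL : (void *)ret;
}
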
Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Acked-by: Yury Khrustalev Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-5-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/x86/include/uapi/asm/mman.h | 3 --- include/uapi/asm-generic/mman.h | 4 ++++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h index 46cdc941f958..ac1e6277212b 100644 --- a/arch/x86/include/uapi/asm/mman.h +++ b/arch/x86/include/uapi/asm/mman.h @@ -5,9 +5,6 @@ #define MAP_32BIT 0x40 /* only give out 32bit addresses */ #define MAP_ABOVE4G 0x80 /* only map above 4GB */ -/* Flags for map_shadow_stack(2) */ -#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in the shadow stack */ - #include #endif /* _ASM_X86_MMAN_H */ diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h index 57e8195d0b53..5e3d61ddbd8c 100644 --- a/include/uapi/asm-generic/mman.h +++ b/include/uapi/asm-generic/mman.h @@ -19,4 +19,8 @@ #define MCL_FUTURE 2 /* lock all future mappings */ #define MCL_ONFAULT 4 /* lock all pages that are faulted in */ +#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in the shadow stack */ +#define SHADOW_STACK_SET_MARKER (1ULL << 1) /* Set up a top of stack marker in the shadow stack */ + + #endif /* __ASM_GENERIC_MMAN_H */ From 830ae8a39685e330b70437529912c17337380ae4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:45 +0100 Subject: [PATCH 006/168] arm64: Document boot requirements for Guarded Control Stacks FEAT_GCS introduces a number of new system registers, we require that access to these registers is not trapped when we identify that the feature is present. There is also a HCRX_EL2 control to make GCS operations functional. Since if GCS is enabled any function call instruction will cause a fault we also require that the feature be specifically disabled, existing kernels implicitly have this requirement and especially given that the MMU must be disabled it is difficult to see a situation where leaving GCS enabled would be reasonable. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-6-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/booting.rst | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/Documentation/arch/arm64/booting.rst b/Documentation/arch/arm64/booting.rst index b57776a68f15..aed6e9f47cf3 100644 --- a/Documentation/arch/arm64/booting.rst +++ b/Documentation/arch/arm64/booting.rst @@ -411,6 +411,38 @@ Before jumping into the kernel, the following conditions must be met: - HFGRWR_EL2.nPIRE0_EL1 (bit 57) must be initialised to 0b1. + - For CPUs with Guarded Control Stacks (FEAT_GCS): + + - GCSCR_EL1 must be initialised to 0. + + - GCSCRE0_EL1 must be initialised to 0. + + - If EL3 is present: + + - SCR_EL3.GCSEn (bit 39) must be initialised to 0b1. + + - If EL2 is present: + + - GCSCR_EL2 must be initialised to 0. + + - If the kernel is entered at EL1 and EL2 is present: + + - HCRX_EL2.GCSEn must be initialised to 0b1. + + - HFGITR_EL2.nGCSEPP (bit 59) must be initialised to 0b1. + + - HFGITR_EL2.nGCSSTR_EL1 (bit 58) must be initialised to 0b1. + + - HFGITR_EL2.nGCSPUSHM_EL1 (bit 57) must be initialised to 0b1. + + - HFGRTR_EL2.nGCS_EL1 (bit 53) must be initialised to 0b1. 
+ + - HFGRTR_EL2.nGCS_EL0 (bit 52) must be initialised to 0b1. + + - HFGWTR_EL2.nGCS_EL1 (bit 53) must be initialised to 0b1. + + - HFGWTR_EL2.nGCS_EL0 (bit 52) must be initialised to 0b1. + The requirements described above for CPU mode, caches, MMUs, architected timers, coherency and system registers apply to all CPUs. All CPUs must enter the kernel in the same exception level. Where the values documented From 7058bf87cd597e0433c2e8207139f922b9df3ef8 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:46 +0100 Subject: [PATCH 007/168] arm64/gcs: Document the ABI for Guarded Control Stacks Add some documentation of the userspace ABI for Guarded Control Stacks. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Acked-by: Yury Khrustalev Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-7-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/gcs.rst | 230 +++++++++++++++++++++++++++++ Documentation/arch/arm64/index.rst | 1 + 2 files changed, 231 insertions(+) create mode 100644 Documentation/arch/arm64/gcs.rst diff --git a/Documentation/arch/arm64/gcs.rst b/Documentation/arch/arm64/gcs.rst new file mode 100644 index 000000000000..af58d9151cb7 --- /dev/null +++ b/Documentation/arch/arm64/gcs.rst @@ -0,0 +1,230 @@ +=============================================== +Guarded Control Stack support for AArch64 Linux +=============================================== + +This document outlines briefly the interface provided to userspace by Linux in +order to support use of the ARM Guarded Control Stack (GCS) feature. + +This is an outline of the most important features and issues only and not +intended to be exhaustive. + + + +1. General +----------- + +* GCS is an architecture feature intended to provide greater protection + against return oriented programming (ROP) attacks and to simplify the + implementation of features that need to collect stack traces such as + profiling. + +* When GCS is enabled a separate guarded control stack is maintained by the + PE which is writeable only through specific GCS operations. This + stores the call stack only, when a procedure call instruction is + performed the current PC is pushed onto the GCS and on RET the + address in the LR is verified against that on the top of the GCS. + +* When active the current GCS pointer is stored in the system register + GCSPR_EL0. This is readable by userspace but can only be updated + via specific GCS instructions. + +* The architecture provides instructions for switching between guarded + control stacks with checks to ensure that the new stack is a valid + target for switching. + +* The functionality of GCS is similar to that provided by the x86 Shadow + Stack feature, due to sharing of userspace interfaces the ABI refers to + shadow stacks rather than GCS. + +* Support for GCS is reported to userspace via HWCAP_GCS in the aux vector + AT_HWCAP2 entry. + +* GCS is enabled per thread. While there is support for disabling GCS + at runtime this should be done with great care. + +* GCS memory access faults are reported as normal memory access faults. + +* GCS specific errors (those reported with EC 0x2d) will be reported as + SIGSEGV with a si_code of SEGV_CPERR (control protection error). + +* GCS is supported only for AArch64. + +* On systems where GCS is supported GCSPR_EL0 is always readable by EL0 + regardless of the GCS configuration for the thread. 
+ +* The architecture supports enabling GCS without verifying that return values + in LR match those in the GCS, the LR will be ignored. This is not supported + by Linux. + + + +2. Enabling and disabling Guarded Control Stacks +------------------------------------------------- + +* GCS is enabled and disabled for a thread via the PR_SET_SHADOW_STACK_STATUS + prctl(), this takes a single flags argument specifying which GCS features + should be used. + +* When set PR_SHADOW_STACK_ENABLE flag allocates a Guarded Control Stack + and enables GCS for the thread, enabling the functionality controlled by + GCSCRE0_EL1.{nTR, RVCHKEN, PCRSEL}. + +* When set the PR_SHADOW_STACK_PUSH flag enables the functionality controlled + by GCSCRE0_EL1.PUSHMEn, allowing explicit GCS pushes. + +* When set the PR_SHADOW_STACK_WRITE flag enables the functionality controlled + by GCSCRE0_EL1.STREn, allowing explicit stores to the Guarded Control Stack. + +* Any unknown flags will cause PR_SET_SHADOW_STACK_STATUS to return -EINVAL. + +* PR_LOCK_SHADOW_STACK_STATUS is passed a bitmask of features with the same + values as used for PR_SET_SHADOW_STACK_STATUS. Any future changes to the + status of the specified GCS mode bits will be rejected. + +* PR_LOCK_SHADOW_STACK_STATUS allows any bit to be locked, this allows + userspace to prevent changes to any future features. + +* There is no support for a process to remove a lock that has been set for + it. + +* PR_SET_SHADOW_STACK_STATUS and PR_LOCK_SHADOW_STACK_STATUS affect only the + thread that called them, any other running threads will be unaffected. + +* New threads inherit the GCS configuration of the thread that created them. + +* GCS is disabled on exec(). + +* The current GCS configuration for a thread may be read with the + PR_GET_SHADOW_STACK_STATUS prctl(), this returns the same flags that + are passed to PR_SET_SHADOW_STACK_STATUS. + +* If GCS is disabled for a thread after having previously been enabled then + the stack will remain allocated for the lifetime of the thread. At present + any attempt to reenable GCS for the thread will be rejected, this may be + revisited in future. + +* It should be noted that since enabling GCS will result in GCS becoming + active immediately it is not normally possible to return from the function + that invoked the prctl() that enabled GCS. It is expected that the normal + usage will be that GCS is enabled very early in execution of a program. + + + +3. Allocation of Guarded Control Stacks +---------------------------------------- + +* When GCS is enabled for a thread a new Guarded Control Stack will be + allocated for it of half the standard stack size or 2 gigabytes, + whichever is smaller. + +* When a new thread is created by a thread which has GCS enabled then a + new Guarded Control Stack will be allocated for the new thread with + half the size of the standard stack. + +* When a stack is allocated by enabling GCS or during thread creation then + the top 8 bytes of the stack will be initialised to 0 and GCSPR_EL0 will + be set to point to the address of this 0 value, this can be used to + detect the top of the stack. + +* Additional Guarded Control Stacks can be allocated using the + map_shadow_stack() system call. + +* Stacks allocated using map_shadow_stack() can optionally have an end of + stack marker and cap placed at the top of the stack. 
If the flag + SHADOW_STACK_SET_TOKEN is specified a cap will be placed on the stack, + if SHADOW_STACK_SET_MARKER is not specified the cap will be the top 8 + bytes of the stack and if it is specified then the cap will be the next + 8 bytes. While specifying just SHADOW_STACK_SET_MARKER by itself is + valid since the marker is all bits 0 it has no observable effect. + +* Stacks allocated using map_shadow_stack() must have a size which is a + multiple of 8 bytes larger than 8 bytes and must be 8 bytes aligned. + +* An address can be specified to map_shadow_stack(), if one is provided then + it must be aligned to a page boundary. + +* When a thread is freed the Guarded Control Stack initially allocated for + that thread will be freed. Note carefully that if the stack has been + switched this may not be the stack currently in use by the thread. + + +4. Signal handling +-------------------- + +* A new signal frame record gcs_context encodes the current GCS mode and + pointer for the interrupted context on signal delivery. This will always + be present on systems that support GCS. + +* The record contains a flag field which reports the current GCS configuration + for the interrupted context as PR_GET_SHADOW_STACK_STATUS would. + +* The signal handler is run with the same GCS configuration as the interrupted + context. + +* When GCS is enabled for the interrupted thread a signal handling specific + GCS cap token will be written to the GCS, this is an architectural GCS cap + with the token type (bits 0..11) all clear. The GCSPR_EL0 reported in the + signal frame will point to this cap token. + +* The signal handler will use the same GCS as the interrupted context. + +* When GCS is enabled on signal entry a frame with the address of the signal + return handler will be pushed onto the GCS, allowing return from the signal + handler via RET as normal. This will not be reported in the gcs_context in + the signal frame. + + +5. Signal return +----------------- + +When returning from a signal handler: + +* If there is a gcs_context record in the signal frame then the GCS flags + and GCSPR_EL0 will be restored from that context prior to further + validation. + +* If there is no gcs_context record in the signal frame then the GCS + configuration will be unchanged. + +* If GCS is enabled on return from a signal handler then GCSPR_EL0 must + point to a valid GCS signal cap record, this will be popped from the + GCS prior to signal return. + +* If the GCS configuration is locked when returning from a signal then any + attempt to change the GCS configuration will be treated as an error. This + is true even if GCS was not enabled prior to signal entry. + +* GCS may be disabled via signal return but any attempt to enable GCS via + signal return will be rejected. + + +6. ptrace extensions +--------------------- + +* A new regset NT_ARM_GCS is defined for use with PTRACE_GETREGSET and + PTRACE_SETREGSET. + +* Due to the complexity surrounding allocation and deallocation of stacks and + lack of practical application it is not possible to enable GCS via ptrace. + GCS may be disabled via the ptrace interface. + +* Other GCS modes may be configured via ptrace. + +* Configuration via ptrace ignores locking of GCS mode bits. + + +7. ELF coredump extensions +--------------------------- + +* NT_ARM_GCS notes will be added to each coredump for each thread of the + dumped process. 
The contents will be equivalent to the data that would + have been read if a PTRACE_GETREGSET of the corresponding type were + executed for each thread when the coredump was generated. + + + +8. /proc extensions +-------------------- + +* Guarded Control Stack pages will include "ss" in their VmFlags in + /proc//smaps. diff --git a/Documentation/arch/arm64/index.rst b/Documentation/arch/arm64/index.rst index 78544de0a8a9..056f6a739d25 100644 --- a/Documentation/arch/arm64/index.rst +++ b/Documentation/arch/arm64/index.rst @@ -15,6 +15,7 @@ ARM64 Architecture cpu-feature-registers cpu-hotplug elf_hwcaps + gcs hugetlbpage kdump legacy_instructions From ce0641d48ddd240053138ce55c3423f833a4237b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:47 +0100 Subject: [PATCH 008/168] arm64/sysreg: Add definitions for architected GCS caps The architecture defines a format for guarded control stack caps, used to mark the top of an unused GCS in order to limit the potential for exploitation via stack switching. Add definitions associated with these. Reviewed-by: Thiago Jung Bauermann Acked-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-8-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/sysreg.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 9ea97dddefc4..9c98ff448bd9 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1101,6 +1101,26 @@ /* Initial value for Permission Overlay Extension for EL0 */ #define POR_EL0_INIT POE_RXW +/* + * Definitions for Guarded Control Stack + */ + +#define GCS_CAP_ADDR_MASK GENMASK(63, 12) +#define GCS_CAP_ADDR_SHIFT 12 +#define GCS_CAP_ADDR_WIDTH 52 +#define GCS_CAP_ADDR(x) FIELD_GET(GCS_CAP_ADDR_MASK, x) + +#define GCS_CAP_TOKEN_MASK GENMASK(11, 0) +#define GCS_CAP_TOKEN_SHIFT 0 +#define GCS_CAP_TOKEN_WIDTH 12 +#define GCS_CAP_TOKEN(x) FIELD_GET(GCS_CAP_TOKEN_MASK, x) + +#define GCS_CAP_VALID_TOKEN 0x1 +#define GCS_CAP_IN_PROGRESS_TOKEN 0x5 + +#define GCS_CAP(x) ((((unsigned long)x) & GCS_CAP_ADDR_MASK) | \ + GCS_CAP_VALID_TOKEN) + #define ARM64_FEATURE_FIELD_BITS 4 /* Defined for compatibility only, do not add new users. */ From dad947cc22cff28348d04e21fa4d6c882385fd7d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:48 +0100 Subject: [PATCH 009/168] arm64/gcs: Add manual encodings of GCS instructions Define C callable functions for GCS instructions used by the kernel. In order to avoid ambitious toolchain requirements for GCS support these are manually encoded, this means we have fixed register numbers which will be a bit limiting for the compiler but none of these should be used in sufficiently fast paths for this to be a problem. Note that GCSSTTR is used to store to EL0. 
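For comparison only (this is not part of the patch and assumes an assembler new enough to accept the mnemonic), the barrier helper could otherwise have been written without a manual encoding:

static inline void gcsb_dsync_mnemonic(void)
{
        /* Same instruction as the ".inst 0xd503227f" encoding used here. */
        asm volatile("gcsb dsync" : : : "memory");
}
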
Reviewed-by: Thiago Jung Bauermann Acked-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-9-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/gcs.h | 51 ++++++++++++++++++++++++++++++++ arch/arm64/include/asm/uaccess.h | 22 ++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 arch/arm64/include/asm/gcs.h diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h new file mode 100644 index 000000000000..7c5e95218db6 --- /dev/null +++ b/arch/arm64/include/asm/gcs.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023 ARM Ltd. + */ +#ifndef __ASM_GCS_H +#define __ASM_GCS_H + +#include +#include + +static inline void gcsb_dsync(void) +{ + asm volatile(".inst 0xd503227f" : : : "memory"); +} + +static inline void gcsstr(u64 *addr, u64 val) +{ + register u64 *_addr __asm__ ("x0") = addr; + register long _val __asm__ ("x1") = val; + + /* GCSSTTR x1, x0 */ + asm volatile( + ".inst 0xd91f1c01\n" + : + : "rZ" (_val), "r" (_addr) + : "memory"); +} + +static inline void gcsss1(u64 Xt) +{ + asm volatile ( + "sys #3, C7, C7, #2, %0\n" + : + : "rZ" (Xt) + : "memory"); +} + +static inline u64 gcsss2(void) +{ + u64 Xt; + + asm volatile( + "SYSL %0, #3, C7, C7, #3\n" + : "=r" (Xt) + : + : "memory"); + + return Xt; +} + +#endif diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 1aa4ecb73429..0db494b24dd0 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -502,4 +502,26 @@ static inline size_t probe_subpage_writeable(const char __user *uaddr, #endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */ +#ifdef CONFIG_ARM64_GCS + +static inline int gcssttr(unsigned long __user *addr, unsigned long val) +{ + register unsigned long __user *_addr __asm__ ("x0") = addr; + register unsigned long _val __asm__ ("x1") = val; + int err = 0; + + /* GCSSTTR x1, x0 */ + asm volatile( + "1: .inst 0xd91f1c01\n" + "2: \n" + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0) + : "+r" (err) + : "rZ" (_val), "r" (_addr) + : "memory"); + + return err; +} + +#endif /* CONFIG_ARM64_GCS */ + #endif /* __ASM_UACCESS_H */ From d0aa2b4351862cc2ce8d97e00c96bffc02ea16af Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:49 +0100 Subject: [PATCH 010/168] arm64/gcs: Provide put_user_gcs() In order for EL1 to write to an EL0 GCS it must use the GCSSTTR instruction rather than a normal STTR. Provide a put_user_gcs() which does this. 
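As an illustrative sketch of the intended use (the real callers arrive later in the series, so the helper below is an assumption), a cap token could be seeded onto a user GCS with:

static int gcs_put_user_cap(unsigned long __user *addr)
{
        int ret = 0;

        /* Write an architecturally valid cap encoding its own address. */
        put_user_gcs(GCS_CAP(addr), addr, &ret);
        if (!ret)
                gcsb_dsync();

        return ret;
}
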
Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-10-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/uaccess.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 0db494b24dd0..5b91803201ef 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -522,6 +522,24 @@ static inline int gcssttr(unsigned long __user *addr, unsigned long val) return err; } +static inline void put_user_gcs(unsigned long val, unsigned long __user *addr, + int *err) +{ + int ret; + + if (!access_ok((char __user *)addr, sizeof(u64))) { + *err = -EFAULT; + return; + } + + uaccess_ttbr0_enable(); + ret = gcssttr(addr, val); + if (ret != 0) + *err = ret; + uaccess_ttbr0_disable(); +} + + #endif /* CONFIG_ARM64_GCS */ #endif /* __ASM_UACCESS_H */ From ff5181d8a2a82c982276a7e035896185c390e856 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:50 +0100 Subject: [PATCH 011/168] arm64/gcs: Provide basic EL2 setup to allow GCS usage at EL0 and EL1 There is a control HCRX_EL2.GCSEn which must be set to allow GCS features to take effect at lower ELs and also fine grained traps for GCS usage at EL0 and EL1. Configure all these to allow GCS usage by EL0 and EL1. We also initialise GCSCR_EL1 and GCSCRE0_EL1 to ensure that we can execute function call instructions without faulting regardless of the state when the kernel is started. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-11-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/el2_setup.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index e0ffdf13a18b..27086a81eae3 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -27,6 +27,14 @@ ubfx x0, x0, #ID_AA64MMFR1_EL1_HCX_SHIFT, #4 cbz x0, .Lskip_hcrx_\@ mov_q x0, HCRX_HOST_FLAGS + + /* Enable GCS if supported */ + mrs_s x1, SYS_ID_AA64PFR1_EL1 + ubfx x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4 + cbz x1, .Lset_hcrx_\@ + orr x0, x0, #HCRX_EL2_GCSEn + +.Lset_hcrx_\@: msr_s SYS_HCRX_EL2, x0 .Lskip_hcrx_\@: .endm @@ -200,6 +208,16 @@ orr x0, x0, #HFGxTR_EL2_nPOR_EL0 .Lskip_poe_fgt_\@: + /* GCS depends on PIE so we don't check it if PIE is absent */ + mrs_s x1, SYS_ID_AA64PFR1_EL1 + ubfx x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4 + cbz x1, .Lset_fgt_\@ + + /* Disable traps of access to GCS registers at EL0 and EL1 */ + orr x0, x0, #HFGxTR_EL2_nGCS_EL1_MASK + orr x0, x0, #HFGxTR_EL2_nGCS_EL0_MASK + +.Lset_fgt_\@: msr_s SYS_HFGRTR_EL2, x0 msr_s SYS_HFGWTR_EL2, x0 msr_s SYS_HFGITR_EL2, xzr @@ -215,6 +233,17 @@ .Lskip_fgt_\@: .endm +.macro __init_el2_gcs + mrs_s x1, SYS_ID_AA64PFR1_EL1 + ubfx x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4 + cbz x1, .Lskip_gcs_\@ + + /* Ensure GCS is not enabled when we start trying to do BLs */ + msr_s SYS_GCSCR_EL1, xzr + msr_s SYS_GCSCRE0_EL1, xzr +.Lskip_gcs_\@: +.endm + .macro __init_el2_nvhe_prepare_eret mov x0, #INIT_PSTATE_EL1 msr spsr_el2, x0 @@ -240,6 +269,7 @@ __init_el2_nvhe_idregs __init_el2_cptr __init_el2_fgt + __init_el2_gcs .endm #ifndef __KVM_NVHE_HYPERVISOR__ From 6487c963083c24ede289d4267ffa60a9db668cd4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 
23:58:51 +0100 Subject: [PATCH 012/168] arm64/cpufeature: Runtime detection of Guarded Control Stack (GCS) Add a cpufeature for GCS, allowing other code to conditionally support it at runtime. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-12-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cpufeature.h | 6 ++++++ arch/arm64/kernel/cpufeature.c | 20 ++++++++++++++++++++ arch/arm64/tools/cpucaps | 1 + 3 files changed, 27 insertions(+) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 3d261cc123c1..69470795f5d2 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -838,6 +838,12 @@ static inline bool system_supports_poe(void) alternative_has_cap_unlikely(ARM64_HAS_S1POE); } +static inline bool system_supports_gcs(void) +{ + return IS_ENABLED(CONFIG_ARM64_GCS) && + alternative_has_cap_unlikely(ARM64_HAS_GCS); +} + int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); bool try_emulate_mrs(struct pt_regs *regs, u32 isn); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 718728a85430..d1e758e99e0a 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -291,6 +291,8 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_GCS), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_GCS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SME_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MPAM_frac_SHIFT, 4, 0), @@ -2358,6 +2360,14 @@ static void cpu_enable_poe(const struct arm64_cpu_capabilities *__unused) } #endif +#ifdef CONFIG_ARM64_GCS +static void cpu_enable_gcs(const struct arm64_cpu_capabilities *__unused) +{ + /* GCSPR_EL0 is always readable */ + write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1); +} +#endif + /* Internal helper functions to match cpu capability type */ static bool cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap) @@ -2889,6 +2899,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = cpu_enable_poe, ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1POE, IMP) }, +#endif +#ifdef CONFIG_ARM64_GCS + { + .desc = "Guarded Control Stack (GCS)", + .capability = ARM64_HAS_GCS, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .cpu_enable = cpu_enable_gcs, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, GCS, IMP) + }, #endif {}, }; diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index eedb5acc21ed..867d25d4a45a 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -29,6 +29,7 @@ HAS_EVT HAS_FPMR HAS_FGT HAS_FPSIMD +HAS_GCS HAS_GENERIC_AUTH HAS_GENERIC_AUTH_ARCH_QARMA3 HAS_GENERIC_AUTH_ARCH_QARMA5 From 092055f1508cce6f60d4927fe8a048d76bbad73e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:52 +0100 Subject: [PATCH 013/168] arm64/mm: Allocate PIE slots for EL0 guarded control stack Pages used for guarded control stacks need to be described to the hardware using the Permission Indirection Extension, GCS is not supported without PIE. In order to support copy on write for guarded stacks we allocate two values, one for active GCSs and one for GCS pages marked as read only prior to copy. 
Since the actual effect is defined using PIE the specific bit pattern used does not matter to the hardware but we choose two values which differ only in PTE_WRITE in order to help share code with non-PIE cases. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-13-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable-prot.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 2a11d0c10760..4e4bcd676f4c 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -144,15 +144,23 @@ static inline bool __pure lpa2_is_enabled(void) /* 6: PTE_PXN | PTE_WRITE */ /* 7: PAGE_SHARED_EXEC PTE_PXN | PTE_WRITE | PTE_USER */ /* 8: PAGE_KERNEL_ROX PTE_UXN */ -/* 9: PTE_UXN | PTE_USER */ +/* 9: PAGE_GCS_RO PTE_UXN | PTE_USER */ /* a: PAGE_KERNEL_EXEC PTE_UXN | PTE_WRITE */ -/* b: PTE_UXN | PTE_WRITE | PTE_USER */ +/* b: PAGE_GCS PTE_UXN | PTE_WRITE | PTE_USER */ /* c: PAGE_KERNEL_RO PTE_UXN | PTE_PXN */ /* d: PAGE_READONLY PTE_UXN | PTE_PXN | PTE_USER */ /* e: PAGE_KERNEL PTE_UXN | PTE_PXN | PTE_WRITE */ /* f: PAGE_SHARED PTE_UXN | PTE_PXN | PTE_WRITE | PTE_USER */ +#define _PAGE_GCS (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_WRITE | PTE_USER) +#define _PAGE_GCS_RO (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_USER) + +#define PAGE_GCS __pgprot(_PAGE_GCS) +#define PAGE_GCS_RO __pgprot(_PAGE_GCS_RO) + #define PIE_E0 ( \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS), PIE_GCS) | \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS_RO), PIE_R) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_X_O) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX_O) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX_O) | \ @@ -160,6 +168,8 @@ static inline bool __pure lpa2_is_enabled(void) PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED), PIE_RW_O)) #define PIE_E1 ( \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS), PIE_NONE_O) | \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS_RO), PIE_NONE_O) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_NONE_O) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_R) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RW) | \ From ae80e1629aeaf5be726a9ea94eb7345b1a44b00d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:53 +0100 Subject: [PATCH 014/168] mm: Define VM_SHADOW_STACK for arm64 when we support GCS Use VM_HIGH_ARCH_5 for guarded control stack pages. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-14-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- Documentation/filesystems/proc.rst | 2 +- include/linux/mm.h | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index e834779d9611..6a882c57a7e7 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -579,7 +579,7 @@ encoded manner. 
The codes are the following: mt arm64 MTE allocation tags are enabled um userfaultfd missing tracking uw userfaultfd wr-protect tracking - ss shadow stack page + ss shadow/guarded control stack page sl sealed == ======================================= diff --git a/include/linux/mm.h b/include/linux/mm.h index 56654306a832..8852c39c7695 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -367,7 +367,17 @@ extern unsigned int kobjsize(const void *objp); * for more details on the guard size. */ # define VM_SHADOW_STACK VM_HIGH_ARCH_5 -#else +#endif + +#if defined(CONFIG_ARM64_GCS) +/* + * arm64's Guarded Control Stack implements similar functionality and + * has similar constraints to shadow stacks. + */ +# define VM_SHADOW_STACK VM_HIGH_ARCH_6 +#endif + +#ifndef VM_SHADOW_STACK # define VM_SHADOW_STACK VM_NONE #endif From 6497b66ba6945f142902c7e8fce86e47016ead1c Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:54 +0100 Subject: [PATCH 015/168] arm64/mm: Map pages for guarded control stack Map pages flagged as being part of a GCS as such rather than using the full set of generic VM flags. This is done using a conditional rather than extending the size of protection_map since that would make for a very sparse array. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-15-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mman.h | 9 +++++++++ arch/arm64/mm/mmap.c | 9 ++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h index 03b790fd0ad8..f6d784f8e6e0 100644 --- a/arch/arm64/include/asm/mman.h +++ b/arch/arm64/include/asm/mman.h @@ -71,6 +71,15 @@ static inline bool arch_validate_flags(unsigned long vm_flags) return false; } + if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) { + /* An executable GCS isn't a good idea. */ + if (vm_flags & VM_EXEC) + return false; + + /* The memory management core should prevent this */ + VM_WARN_ON(vm_flags & VM_SHARED); + } + return true; } diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 7e3ad97e27d8..07aeab8a7606 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -83,8 +83,15 @@ arch_initcall(adjust_protection_map); pgprot_t vm_get_page_prot(unsigned long vm_flags) { - pteval_t prot = pgprot_val(protection_map[vm_flags & + pteval_t prot; + + /* Short circuit GCS to avoid bloating the table. */ + if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) { + prot = _PAGE_GCS_RO; + } else { + prot = pgprot_val(protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]); + } if (vm_flags & VM_ARM64_BTI) prot |= PTE_GP; From a94452112ce4020af073af368d7c25a07bfabf37 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:56 +0100 Subject: [PATCH 016/168] arm64/idreg: Add overrride for GCS Hook up an override for GCS, allowing it to be disabled from the command line by specifying arm64.nogcs in case there are problems. 
Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Acked-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-17-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- Documentation/admin-guide/kernel-parameters.txt | 3 +++ arch/arm64/kernel/pi/idreg-override.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1518343bbe22..c1b00f709734 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -446,6 +446,9 @@ arm64.nobti [ARM64] Unconditionally disable Branch Target Identification support + arm64.nogcs [ARM64] Unconditionally disable Guarded Control Stack + support + arm64.nomops [ARM64] Unconditionally disable Memory Copy and Memory Set instructions support diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index 29d4b6244a6f..2bb709d78405 100644 --- a/arch/arm64/kernel/pi/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -133,6 +133,7 @@ static const struct ftr_set_desc pfr1 __prel64_initconst = { .override = &id_aa64pfr1_override, .fields = { FIELD("bt", ID_AA64PFR1_EL1_BT_SHIFT, NULL ), + FIELD("gcs", ID_AA64PFR1_EL1_GCS_SHIFT, NULL), FIELD("mte", ID_AA64PFR1_EL1_MTE_SHIFT, NULL), FIELD("sme", ID_AA64PFR1_EL1_SME_SHIFT, pfr1_sme_filter), {} @@ -215,6 +216,7 @@ static const struct { { "arm64.nosve", "id_aa64pfr0.sve=0" }, { "arm64.nosme", "id_aa64pfr1.sme=0" }, { "arm64.nobti", "id_aa64pfr1.bt=0" }, + { "arm64.nogcs", "id_aa64pfr1.gcs=0" }, { "arm64.nopauth", "id_aa64isar1.gpi=0 id_aa64isar1.gpa=0 " "id_aa64isar1.api=0 id_aa64isar1.apa=0 " From eefc98711f84abd7ffb074af0cdbc5f8a8464272 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:57 +0100 Subject: [PATCH 017/168] arm64/hwcap: Add hwcap for GCS Provide a hwcap to enable userspace to detect support for GCS. Signed-off-by: Mark Brown Acked-by: Yury Khrustalev Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-18-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/elf_hwcaps.rst | 4 ++++ arch/arm64/include/asm/hwcap.h | 1 + arch/arm64/include/uapi/asm/hwcap.h | 3 ++- arch/arm64/kernel/cpufeature.c | 3 +++ arch/arm64/kernel/cpuinfo.c | 1 + 5 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Documentation/arch/arm64/elf_hwcaps.rst b/Documentation/arch/arm64/elf_hwcaps.rst index 694f67fa07d1..25b41ff74fa0 100644 --- a/Documentation/arch/arm64/elf_hwcaps.rst +++ b/Documentation/arch/arm64/elf_hwcaps.rst @@ -170,6 +170,10 @@ HWCAP_PACG ID_AA64ISAR1_EL1.GPI == 0b0001, as described by Documentation/arch/arm64/pointer-authentication.rst. +HWCAP_GCS + Functionality implied by ID_AA64PFR1_EL1.GCS == 0b1, as + described by Documentation/arch/arm64/gcs.rst. + HWCAP2_DCPODP Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0010. 
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index a775adddecf2..7bcf1347ca0b 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -92,6 +92,7 @@ #define KERNEL_HWCAP_SB __khwcap_feature(SB) #define KERNEL_HWCAP_PACA __khwcap_feature(PACA) #define KERNEL_HWCAP_PACG __khwcap_feature(PACG) +#define KERNEL_HWCAP_GCS __khwcap_feature(GCS) #define __khwcap2_feature(x) (const_ilog2(HWCAP2_ ## x) + 64) #define KERNEL_HWCAP_DCPODP __khwcap2_feature(DCPODP) diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 055381b2c615..675642ec4d91 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -21,7 +21,7 @@ * HWCAP flags - for AT_HWCAP * * Bits 62 and 63 are reserved for use by libc. - * Bits 32-61 are unallocated for potential use by libc. + * Bits 33-61 are unallocated for potential use by libc. */ #define HWCAP_FP (1 << 0) #define HWCAP_ASIMD (1 << 1) @@ -55,6 +55,7 @@ #define HWCAP_SB (1 << 29) #define HWCAP_PACA (1 << 30) #define HWCAP_PACG (1UL << 31) +#define HWCAP_GCS (1UL << 32) /* * HWCAP2 flags - for AT_HWCAP2 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index d1e758e99e0a..b8655d55f318 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -3025,6 +3025,9 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ZFR0_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEI8MM), HWCAP_CAP(ID_AA64ZFR0_EL1, F32MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF32MM), HWCAP_CAP(ID_AA64ZFR0_EL1, F64MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM), +#endif +#ifdef CONFIG_ARM64_GCS + HWCAP_CAP(ID_AA64PFR1_EL1, GCS, IMP, CAP_HWCAP, KERNEL_HWCAP_GCS), #endif HWCAP_CAP(ID_AA64PFR1_EL1, SSBS, SSBS2, CAP_HWCAP, KERNEL_HWCAP_SSBS), #ifdef CONFIG_ARM64_BTI diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 44718d0482b3..f2f92c6b1c85 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -80,6 +80,7 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_SB] = "sb", [KERNEL_HWCAP_PACA] = "paca", [KERNEL_HWCAP_PACG] = "pacg", + [KERNEL_HWCAP_GCS] = "gcs", [KERNEL_HWCAP_DCPODP] = "dcpodp", [KERNEL_HWCAP_SVE2] = "sve2", [KERNEL_HWCAP_SVEAES] = "sveaes", From 8ce71d270536dd7a48698a2b18ddf13f2d5007fb Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:58 +0100 Subject: [PATCH 018/168] arm64/traps: Handle GCS exceptions A new exception code is defined for GCS specific faults other than standard load/store faults, for example GCS token validation failures, add handling for this. These faults are reported to userspace as segfaults with code SEGV_CPERR (protection error), mirroring the reporting for x86 shadow stack errors. GCS faults due to memory load/store operations generate data aborts with a flag set, these will be handled separately as part of the data abort handling. Since we do not currently enable GCS for EL1 we should not get any faults there but while we're at it we wire things up there, treating any GCS fault as fatal. 
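To illustrate what userspace observes (this example is not part of the patch; the fallback define mirrors the UAPI value for libcs that do not yet expose it), such faults can be distinguished from other segfaults via si_code:

#include <signal.h>
#include <unistd.h>

#ifndef SEGV_CPERR
#define SEGV_CPERR      10
#endif

static void segv_handler(int sig, siginfo_t *info, void *ucontext)
{
        if (info->si_code == SEGV_CPERR) {
                static const char msg[] = "GCS protection error\n";

                write(STDERR_FILENO, msg, sizeof(msg) - 1);
        }
        _exit(1);
}

static void install_segv_handler(void)
{
        struct sigaction sa = { .sa_sigaction = segv_handler,
                                .sa_flags = SA_SIGINFO };

        sigaction(SIGSEGV, &sa, NULL);
}
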
Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-19-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/esr.h | 28 +++++++++++++++++++++++++++- arch/arm64/include/asm/exception.h | 2 ++ arch/arm64/kernel/entry-common.c | 23 +++++++++++++++++++++++ arch/arm64/kernel/traps.c | 11 +++++++++++ 4 files changed, 63 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index da6d2c1c0b03..d1b1a33f9a8b 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -51,7 +51,8 @@ #define ESR_ELx_EC_FP_EXC32 UL(0x28) /* Unallocated EC: 0x29 - 0x2B */ #define ESR_ELx_EC_FP_EXC64 UL(0x2C) -/* Unallocated EC: 0x2D - 0x2E */ +#define ESR_ELx_EC_GCS UL(0x2D) +/* Unallocated EC: 0x2E */ #define ESR_ELx_EC_SERROR UL(0x2F) #define ESR_ELx_EC_BREAKPT_LOW UL(0x30) #define ESR_ELx_EC_BREAKPT_CUR UL(0x31) @@ -386,6 +387,31 @@ #define ESR_ELx_MOPS_ISS_SRCREG(esr) (((esr) & (UL(0x1f) << 5)) >> 5) #define ESR_ELx_MOPS_ISS_SIZEREG(esr) (((esr) & (UL(0x1f) << 0)) >> 0) +/* ISS field definitions for GCS */ +#define ESR_ELx_ExType_SHIFT (20) +#define ESR_ELx_ExType_MASK GENMASK(23, 20) +#define ESR_ELx_Raddr_SHIFT (10) +#define ESR_ELx_Raddr_MASK GENMASK(14, 10) +#define ESR_ELx_Rn_SHIFT (5) +#define ESR_ELx_Rn_MASK GENMASK(9, 5) +#define ESR_ELx_Rvalue_SHIFT 5 +#define ESR_ELx_Rvalue_MASK GENMASK(9, 5) +#define ESR_ELx_IT_SHIFT (0) +#define ESR_ELx_IT_MASK GENMASK(4, 0) + +#define ESR_ELx_ExType_DATA_CHECK 0 +#define ESR_ELx_ExType_EXLOCK 1 +#define ESR_ELx_ExType_STR 2 + +#define ESR_ELx_IT_RET 0 +#define ESR_ELx_IT_GCSPOPM 1 +#define ESR_ELx_IT_RET_KEYA 2 +#define ESR_ELx_IT_RET_KEYB 3 +#define ESR_ELx_IT_GCSSS1 4 +#define ESR_ELx_IT_GCSSS2 5 +#define ESR_ELx_IT_GCSPOPCX 6 +#define ESR_ELx_IT_GCSPOPX 7 + #ifndef __ASSEMBLY__ #include diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index f296662590c7..674518464718 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -57,6 +57,8 @@ void do_el0_undef(struct pt_regs *regs, unsigned long esr); void do_el1_undef(struct pt_regs *regs, unsigned long esr); void do_el0_bti(struct pt_regs *regs); void do_el1_bti(struct pt_regs *regs, unsigned long esr); +void do_el0_gcs(struct pt_regs *regs, unsigned long esr); +void do_el1_gcs(struct pt_regs *regs, unsigned long esr); void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr, struct pt_regs *regs); void do_fpsimd_acc(unsigned long esr, struct pt_regs *regs); diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 3fcd9d080bf2..fe74813009bd 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -463,6 +463,15 @@ static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) exit_to_kernel_mode(regs); } +static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr) +{ + enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_el1_gcs(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs); +} + static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); @@ -505,6 +514,9 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_BTI: el1_bti(regs, esr); break; + case ESR_ELx_EC_GCS: + el1_gcs(regs, esr); + break; case ESR_ELx_EC_BREAKPT_CUR: case ESR_ELx_EC_SOFTSTP_CUR: case 
ESR_ELx_EC_WATCHPT_CUR: @@ -684,6 +696,14 @@ static void noinstr el0_mops(struct pt_regs *regs, unsigned long esr) exit_to_user_mode(regs); } +static void noinstr el0_gcs(struct pt_regs *regs, unsigned long esr) +{ + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_el0_gcs(regs, esr); + exit_to_user_mode(regs); +} + static void noinstr el0_inv(struct pt_regs *regs, unsigned long esr) { enter_from_user_mode(regs); @@ -766,6 +786,9 @@ asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_MOPS: el0_mops(regs, esr); break; + case ESR_ELx_EC_GCS: + el0_gcs(regs, esr); + break; case ESR_ELx_EC_BREAKPT_LOW: case ESR_ELx_EC_SOFTSTP_LOW: case ESR_ELx_EC_WATCHPT_LOW: diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 563cbce11126..fdbcf047108c 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -506,6 +506,16 @@ void do_el1_bti(struct pt_regs *regs, unsigned long esr) die("Oops - BTI", regs, esr); } +void do_el0_gcs(struct pt_regs *regs, unsigned long esr) +{ + force_signal_inject(SIGSEGV, SEGV_CPERR, regs->pc, 0); +} + +void do_el1_gcs(struct pt_regs *regs, unsigned long esr) +{ + die("Oops - GCS", regs, esr); +} + void do_el0_fpac(struct pt_regs *regs, unsigned long esr) { force_signal_inject(SIGILL, ILL_ILLOPN, regs->pc, esr); @@ -852,6 +862,7 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_MOPS] = "MOPS", [ESR_ELx_EC_FP_EXC32] = "FP (AArch32)", [ESR_ELx_EC_FP_EXC64] = "FP (AArch64)", + [ESR_ELx_EC_GCS] = "Guarded Control Stack", [ESR_ELx_EC_SERROR] = "SError", [ESR_ELx_EC_BREAKPT_LOW] = "Breakpoint (lower EL)", [ESR_ELx_EC_BREAKPT_CUR] = "Breakpoint (current EL)", From cfad706e8f6ded5cec69f820bceeca9e64394592 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:59 +0100 Subject: [PATCH 019/168] arm64/mm: Handle GCS data aborts All GCS operations at EL0 must happen on a page which is marked as having UnprivGCS access, including read operations. If a GCS operation attempts to access a page without this then it will generate a data abort with the GCS bit set in ESR_EL1.ISS2. EL0 may validly generate such faults, for example due to copy on write which will cause the GCS data to be stored in a read only page with no GCS permissions until the actual copy happens. Since UnprivGCS allows both reads and writes to the GCS (though only through GCS operations) we need to ensure that the memory management subsystem handles GCS accesses as writes at all times. Do this by adding FAULT_FLAG_WRITE to any GCS page faults, adding handling to ensure that invalid cases are identfied as such early so the memory management core does not think they will succeed. The core cannot distinguish between VMAs which are generally writeable and VMAs which are only writeable through GCS operations. EL1 may validly write to EL0 GCS for management purposes (eg, while initialising with cap tokens). We also report any GCS faults in VMAs not marked as part of a GCS as access violations, causing a fault to be delivered to userspace if it attempts to do GCS operations outside a GCS. 
Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-20-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/mm/fault.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 8b281cf308b3..c2f89a678ac0 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -504,6 +504,14 @@ static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma, false); } +static bool is_gcs_fault(unsigned long esr) +{ + if (!esr_is_data_abort(esr)) + return false; + + return ESR_ELx_ISS2(esr) & ESR_ELx_GCS; +} + static bool is_el0_instruction_abort(unsigned long esr) { return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW; @@ -518,6 +526,23 @@ static bool is_write_abort(unsigned long esr) return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM); } +static bool is_invalid_gcs_access(struct vm_area_struct *vma, u64 esr) +{ + if (!system_supports_gcs()) + return false; + + if (unlikely(is_gcs_fault(esr))) { + /* GCS accesses must be performed on a GCS page */ + if (!(vma->vm_flags & VM_SHADOW_STACK)) + return true; + } else if (unlikely(vma->vm_flags & VM_SHADOW_STACK)) { + /* Only GCS operations can write to a GCS page */ + return esr_is_data_abort(esr) && is_write_abort(esr); + } + + return false; +} + static int __kprobes do_page_fault(unsigned long far, unsigned long esr, struct pt_regs *regs) { @@ -554,6 +579,14 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, /* It was exec fault */ vm_flags = VM_EXEC; mm_flags |= FAULT_FLAG_INSTRUCTION; + } else if (is_gcs_fault(esr)) { + /* + * The GCS permission on a page implies both read and + * write so always handle any GCS fault as a write fault, + * we need to trigger CoW even for GCS reads. + */ + vm_flags = VM_WRITE; + mm_flags |= FAULT_FLAG_WRITE; } else if (is_write_abort(esr)) { /* It was write fault */ vm_flags = VM_WRITE; @@ -587,6 +620,13 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, if (!vma) goto lock_mmap; + if (is_invalid_gcs_access(vma, esr)) { + vma_end_read(vma); + fault = 0; + si_code = SEGV_ACCERR; + goto bad_area; + } + if (!(vma->vm_flags & vm_flags)) { vma_end_read(vma); fault = 0; From fc84bc5378a8c852308fa022957b8976adb5aa6a Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:00 +0100 Subject: [PATCH 020/168] arm64/gcs: Context switch GCS state for EL0 There are two registers controlling the GCS state of EL0, GCSPR_EL0 which is the current GCS pointer and GCSCRE0_EL1 which has enable bits for the specific GCS functionality enabled for EL0. Manage these on context switch and process lifetime events, GCS is reset on exec(). Also ensure that any changes to the GCS memory are visible to other PEs and that changes from other PEs are visible on this one by issuing a GCSB DSYNC when moving to or from a thread with GCS. Since the current GCS configuration of a thread will be visible to userspace we store the configuration in the format used with userspace and provide a helper which configures the system register as needed. On systems that support GCS we always allow access to GCSPR_EL0, this facilitates reporting of GCS faults if userspace implements disabling of GCS on error - the GCS can still be discovered and examined even if GCS has been disabled. 
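For example (illustrative only, not part of this patch), userspace can sample the current GCS pointer directly; the explicit register encoding is used here on the assumption that the toolchain does not yet know the GCS register names:

static inline unsigned long get_gcspr_el0(void)
{
        unsigned long val;

        /* GCSPR_EL0 is encoded as S3_3_C2_C5_1 */
        asm volatile("mrs %0, S3_3_C2_C5_1" : "=r" (val));

        return val;
}
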
Reviewed-by: Catalin Marinas Reviewed-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-21-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/gcs.h | 24 ++++++++++++ arch/arm64/include/asm/processor.h | 6 +++ arch/arm64/kernel/process.c | 62 ++++++++++++++++++++++++++++++ arch/arm64/mm/Makefile | 1 + arch/arm64/mm/gcs.c | 42 ++++++++++++++++++++ 5 files changed, 135 insertions(+) create mode 100644 arch/arm64/mm/gcs.c diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h index 7c5e95218db6..04594ef59dad 100644 --- a/arch/arm64/include/asm/gcs.h +++ b/arch/arm64/include/asm/gcs.h @@ -48,4 +48,28 @@ static inline u64 gcsss2(void) return Xt; } +#ifdef CONFIG_ARM64_GCS + +static inline bool task_gcs_el0_enabled(struct task_struct *task) +{ + return current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE; +} + +void gcs_set_el0_mode(struct task_struct *task); +void gcs_free(struct task_struct *task); +void gcs_preserve_current_state(void); + +#else + +static inline bool task_gcs_el0_enabled(struct task_struct *task) +{ + return false; +} + +static inline void gcs_set_el0_mode(struct task_struct *task) { } +static inline void gcs_free(struct task_struct *task) { } +static inline void gcs_preserve_current_state(void) { } + +#endif + #endif diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 1438424f0064..5260788247d8 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -185,6 +185,12 @@ struct thread_struct { u64 svcr; u64 tpidr2_el0; u64 por_el0; +#ifdef CONFIG_ARM64_GCS + unsigned int gcs_el0_mode; + u64 gcspr_el0; + u64 gcs_base; + u64 gcs_size; +#endif }; static inline unsigned int thread_get_vl(struct thread_struct *thread, diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 0540653fbf38..aedcf332f422 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -280,6 +281,25 @@ static void flush_poe(void) write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); } +#ifdef CONFIG_ARM64_GCS + +static void flush_gcs(void) +{ + if (!system_supports_gcs()) + return; + + gcs_free(current); + current->thread.gcs_el0_mode = 0; + write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1); + write_sysreg_s(0, SYS_GCSPR_EL0); +} + +#else + +static void flush_gcs(void) { } + +#endif + void flush_thread(void) { fpsimd_flush_thread(); @@ -287,6 +307,7 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(current); flush_tagged_addr_state(); flush_poe(); + flush_gcs(); } void arch_release_task_struct(struct task_struct *tsk) @@ -484,6 +505,46 @@ static void entry_task_switch(struct task_struct *next) __this_cpu_write(__entry_task, next); } +#ifdef CONFIG_ARM64_GCS + +void gcs_preserve_current_state(void) +{ + current->thread.gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); +} + +static void gcs_thread_switch(struct task_struct *next) +{ + if (!system_supports_gcs()) + return; + + /* GCSPR_EL0 is always readable */ + gcs_preserve_current_state(); + write_sysreg_s(next->thread.gcspr_el0, SYS_GCSPR_EL0); + + if (current->thread.gcs_el0_mode != next->thread.gcs_el0_mode) + gcs_set_el0_mode(next); + + /* + * Ensure that GCS memory effects of the 'prev' thread are + * ordered before other memory accesses with release semantics + * (or preceded by a DMB) on the current PE. 
In addition, any + * memory accesses with acquire semantics (or succeeded by a + * DMB) are ordered before GCS memory effects of the 'next' + * thread. This will ensure that the GCS memory effects are + * visible to other PEs in case of migration. + */ + if (task_gcs_el0_enabled(current) || task_gcs_el0_enabled(next)) + gcsb_dsync(); +} + +#else + +static void gcs_thread_switch(struct task_struct *next) +{ +} + +#endif + /* * Handle sysreg updates for ARM erratum 1418040 which affects the 32bit view of * CNTVCT, various other errata which require trapping all CNTVCT{,_EL0} @@ -580,6 +641,7 @@ struct task_struct *__switch_to(struct task_struct *prev, cntkctl_thread_switch(prev, next); ptrauth_thread_switch_user(next); permission_overlay_switch(next); + gcs_thread_switch(next); /* * Complete any pending TLB or cache maintenance on this CPU in case diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index 2fc8c6dd0407..fc92170a8f37 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_TRANS_TABLE) += trans_pgd.o obj-$(CONFIG_TRANS_TABLE) += trans_pgd-asm.o obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o obj-$(CONFIG_ARM64_MTE) += mteswap.o +obj-$(CONFIG_ARM64_GCS) += gcs.o KASAN_SANITIZE_physaddr.o += n obj-$(CONFIG_KASAN) += kasan_init.o diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c new file mode 100644 index 000000000000..f8f4f984a247 --- /dev/null +++ b/arch/arm64/mm/gcs.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include + +#include +#include + +/* + * Apply the GCS mode configured for the specified task to the + * hardware. + */ +void gcs_set_el0_mode(struct task_struct *task) +{ + u64 gcscre0_el1 = GCSCRE0_EL1_nTR; + + if (task->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE) + gcscre0_el1 |= GCSCRE0_EL1_RVCHKEN | GCSCRE0_EL1_PCRSEL; + + if (task->thread.gcs_el0_mode & PR_SHADOW_STACK_WRITE) + gcscre0_el1 |= GCSCRE0_EL1_STREn; + + if (task->thread.gcs_el0_mode & PR_SHADOW_STACK_PUSH) + gcscre0_el1 |= GCSCRE0_EL1_PUSHMEn; + + write_sysreg_s(gcscre0_el1, SYS_GCSCRE0_EL1); +} + +void gcs_free(struct task_struct *task) +{ + if (!system_supports_gcs()) + return; + + if (task->thread.gcs_base) + vm_munmap(task->thread.gcs_base, task->thread.gcs_size); + + task->thread.gcspr_el0 = 0; + task->thread.gcs_base = 0; + task->thread.gcs_size = 0; +} From 506496bcbb4204c9ff5cfe82b1b90e1f14366992 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:01 +0100 Subject: [PATCH 021/168] arm64/gcs: Ensure that new threads have a GCS When a new thread is created by a thread with GCS enabled the GCS needs to be specified along with the regular stack. Unfortunately plain clone() is not extensible and existing clone3() users will not specify a stack so all existing code would be broken if we mandated specifying the stack explicitly. For compatibility with these cases and also x86 (which did not initially implement clone3() support for shadow stacks) if no GCS is specified we will allocate one so when a thread is created which has GCS enabled allocate one for it. We follow the extensively discussed x86 implementation and allocate min(RLIMIT_STACK/2, 2G). Since the GCS only stores the call stack and not any variables this should be more than sufficient for most applications. GCSs allocated via this mechanism will be freed when the thread exits. 
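To make the default sizing concrete, the following is a userspace mirror of the policy described above, PAGE_ALIGN(min(RLIMIT_STACK / 2, 2G)) with a minimum of one page. It is purely illustrative and is not a kernel interface:

#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

#define SZ_2G	(2ULL << 30)

static unsigned long long default_gcs_size(void)
{
	unsigned long long page = sysconf(_SC_PAGESIZE);
	unsigned long long size;
	struct rlimit rl;

	if (getrlimit(RLIMIT_STACK, &rl) != 0)
		return 0;

	/* min(RLIMIT_STACK / 2, 2G); RLIM_INFINITY clamps to 2G */
	size = rl.rlim_cur / 2;
	if (size > SZ_2G)
		size = SZ_2G;

	/* PAGE_ALIGN(), with PAGE_SIZE as the lower bound */
	size = (size + page - 1) & ~(page - 1);

	return size < page ? page : size;
}

int main(void)
{
	printf("default thread GCS size: %llu bytes\n", default_gcs_size());
	return 0;
}
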
Reviewed-by: Thiago Jung Bauermann Acked-by: Yury Khrustalev Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-22-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/gcs.h | 9 ++++ arch/arm64/include/asm/mmu_context.h | 9 ++++ arch/arm64/kernel/process.c | 32 +++++++++++++ arch/arm64/mm/gcs.c | 69 ++++++++++++++++++++++++++++ 4 files changed, 119 insertions(+) diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h index 04594ef59dad..c1f274fdb9c0 100644 --- a/arch/arm64/include/asm/gcs.h +++ b/arch/arm64/include/asm/gcs.h @@ -8,6 +8,8 @@ #include #include +struct kernel_clone_args; + static inline void gcsb_dsync(void) { asm volatile(".inst 0xd503227f" : : : "memory"); @@ -58,6 +60,8 @@ static inline bool task_gcs_el0_enabled(struct task_struct *task) void gcs_set_el0_mode(struct task_struct *task); void gcs_free(struct task_struct *task); void gcs_preserve_current_state(void); +unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, + const struct kernel_clone_args *args); #else @@ -69,6 +73,11 @@ static inline bool task_gcs_el0_enabled(struct task_struct *task) static inline void gcs_set_el0_mode(struct task_struct *task) { } static inline void gcs_free(struct task_struct *task) { } static inline void gcs_preserve_current_state(void) { } +static inline unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, + const struct kernel_clone_args *args) +{ + return -ENOTSUPP; +} #endif diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 7c09d47e09cb..48b3d9553b67 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -311,6 +312,14 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, return por_el0_allows_pkey(vma_pkey(vma), write, execute); } +#define deactivate_mm deactivate_mm +static inline void deactivate_mm(struct task_struct *tsk, + struct mm_struct *mm) +{ + gcs_free(tsk); +} + + #include #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index aedcf332f422..fdd095480c3f 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -294,9 +294,35 @@ static void flush_gcs(void) write_sysreg_s(0, SYS_GCSPR_EL0); } +static int copy_thread_gcs(struct task_struct *p, + const struct kernel_clone_args *args) +{ + unsigned long gcs; + + if (!system_supports_gcs()) + return 0; + + p->thread.gcs_base = 0; + p->thread.gcs_size = 0; + + gcs = gcs_alloc_thread_stack(p, args); + if (IS_ERR_VALUE(gcs)) + return PTR_ERR((void *)gcs); + + p->thread.gcs_el0_mode = current->thread.gcs_el0_mode; + p->thread.gcs_el0_locked = current->thread.gcs_el0_locked; + + return 0; +} + #else static void flush_gcs(void) { } +static int copy_thread_gcs(struct task_struct *p, + const struct kernel_clone_args *args) +{ + return 0; +} #endif @@ -313,6 +339,7 @@ void flush_thread(void) void arch_release_task_struct(struct task_struct *tsk) { fpsimd_release_task(tsk); + gcs_free(tsk); } int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) @@ -376,6 +403,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) unsigned long stack_start = args->stack; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); + int ret; memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context)); @@ -420,6 +448,10 @@ int copy_thread(struct 
task_struct *p, const struct kernel_clone_args *args) p->thread.uw.tp_value = tls; p->thread.tpidr2_el0 = 0; } + + ret = copy_thread_gcs(p, args); + if (ret != 0) + return ret; } else { /* * A kthread has no context to ERET to, so ensure any buggy diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c index f8f4f984a247..3c7a18f57ea9 100644 --- a/arch/arm64/mm/gcs.c +++ b/arch/arm64/mm/gcs.c @@ -5,9 +5,69 @@ #include #include +#include #include +#include #include +static unsigned long alloc_gcs(unsigned long addr, unsigned long size) +{ + int flags = MAP_ANONYMOUS | MAP_PRIVATE; + struct mm_struct *mm = current->mm; + unsigned long mapped_addr, unused; + + if (addr) + flags |= MAP_FIXED_NOREPLACE; + + mmap_write_lock(mm); + mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags, + VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL); + mmap_write_unlock(mm); + + return mapped_addr; +} + +static unsigned long gcs_size(unsigned long size) +{ + if (size) + return PAGE_ALIGN(size); + + /* Allocate RLIMIT_STACK/2 with limits of PAGE_SIZE..2G */ + size = PAGE_ALIGN(min_t(unsigned long long, + rlimit(RLIMIT_STACK) / 2, SZ_2G)); + return max(PAGE_SIZE, size); +} + +unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, + const struct kernel_clone_args *args) +{ + unsigned long addr, size; + + if (!system_supports_gcs()) + return 0; + + if (!task_gcs_el0_enabled(tsk)) + return 0; + + if ((args->flags & (CLONE_VFORK | CLONE_VM)) != CLONE_VM) { + tsk->thread.gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); + return 0; + } + + size = args->stack_size / 2; + + size = gcs_size(size); + addr = alloc_gcs(0, size); + if (IS_ERR_VALUE(addr)) + return addr; + + tsk->thread.gcs_base = addr; + tsk->thread.gcs_size = size; + tsk->thread.gcspr_el0 = addr + size - sizeof(u64); + + return addr; +} + /* * Apply the GCS mode configured for the specified task to the * hardware. @@ -33,6 +93,15 @@ void gcs_free(struct task_struct *task) if (!system_supports_gcs()) return; + /* + * When fork() with CLONE_VM fails, the child (tsk) already + * has a GCS allocated, and exit_thread() calls this function + * to free it. In this case the parent (current) and the + * child share the same mm struct. + */ + if (!task->mm || task->mm != current->mm) + return; + if (task->thread.gcs_base) vm_munmap(task->thread.gcs_base, task->thread.gcs_size); From b57180c75c7ebff6613886cb69ef6e283a10358b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:02 +0100 Subject: [PATCH 022/168] arm64/gcs: Implement shadow stack prctl() interface Implement the architecture neutral prctl() interface for setting the shadow stack status, this supports setting and reading the current GCS configuration for the current thread. Userspace can enable basic GCS functionality and additionally also support for GCS pushes and arbitrary GCS stores. It is expected that this prctl() will be called very early in application startup, for example by the dynamic linker, and not subsequently adjusted during normal operation. Users should carefully note that after enabling GCS for a thread GCS will become active with no call stack so it is not normally possible to return from the function that invoked the prctl(). State is stored per thread, enabling GCS for a thread causes a GCS to be allocated for that thread. Userspace may lock the current GCS configuration by specifying PR_SHADOW_STACK_ENABLE_LOCK, this prevents any further changes to the GCS configuration via any means. 
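As an illustration of the expected early-startup usage (a sketch, not part of this patch; the prctl() request values are those from the arch-agnostic interface added earlier in this series and may already be provided by newer libc headers):

#include <stdlib.h>
#include <sys/prctl.h>

#ifndef PR_SET_SHADOW_STACK_STATUS
#define PR_GET_SHADOW_STACK_STATUS	74
#define PR_SET_SHADOW_STACK_STATUS	75
#define PR_LOCK_SHADOW_STACK_STATUS	76
#define PR_SHADOW_STACK_ENABLE		(1UL << 0)
#endif

int main(void)
{
	unsigned long mode;

	/* The prctl() fails on kernels or hardware without GCS */
	if (prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0) != 0)
		return 0;	/* no GCS support, nothing to do */

	if (!(mode & PR_SHADOW_STACK_ENABLE) &&
	    prctl(PR_SET_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE,
		  0, 0, 0) != 0)
		return 1;

	/* Optionally prevent any further changes for this thread */
	prctl(PR_LOCK_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE, 0, 0, 0);

	/* ... application work ... */

	/*
	 * main() was entered without a GCS so we must not return from
	 * it now that GCS is enabled, exit() instead.
	 */
	exit(0);
}

Note that the sketch calls exit() rather than returning from the function that enabled GCS, for the reason described above.
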
If GCS is not being enabled then all flags other than _LOCK are ignored, it is not possible to enable stores or pops without enabling GCS. When disabling the GCS we do not free the allocated stack, this allows for inspection of the GCS after disabling as part of fault reporting. Since it is not an expected use case and since it presents some complications in determining what to do with previously initialsed data on the GCS attempts to reenable GCS after this are rejected. This can be revisted if a use case arises. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-23-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/gcs.h | 22 +++++++++ arch/arm64/include/asm/processor.h | 1 + arch/arm64/mm/gcs.c | 79 ++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+) diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h index c1f274fdb9c0..48c97e63e56a 100644 --- a/arch/arm64/include/asm/gcs.h +++ b/arch/arm64/include/asm/gcs.h @@ -50,6 +50,9 @@ static inline u64 gcsss2(void) return Xt; } +#define PR_SHADOW_STACK_SUPPORTED_STATUS_MASK \ + (PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE | PR_SHADOW_STACK_PUSH) + #ifdef CONFIG_ARM64_GCS static inline bool task_gcs_el0_enabled(struct task_struct *task) @@ -63,6 +66,20 @@ void gcs_preserve_current_state(void); unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, const struct kernel_clone_args *args); +static inline int gcs_check_locked(struct task_struct *task, + unsigned long new_val) +{ + unsigned long cur_val = task->thread.gcs_el0_mode; + + cur_val &= task->thread.gcs_el0_locked; + new_val &= task->thread.gcs_el0_locked; + + if (cur_val != new_val) + return -EBUSY; + + return 0; +} + #else static inline bool task_gcs_el0_enabled(struct task_struct *task) @@ -78,6 +95,11 @@ static inline unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, { return -ENOTSUPP; } +static inline int gcs_check_locked(struct task_struct *task, + unsigned long new_val) +{ + return 0; +} #endif diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 5260788247d8..37fefdc3d3a3 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -187,6 +187,7 @@ struct thread_struct { u64 por_el0; #ifdef CONFIG_ARM64_GCS unsigned int gcs_el0_mode; + unsigned int gcs_el0_locked; u64 gcspr_el0; u64 gcs_base; u64 gcs_size; diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c index 3c7a18f57ea9..61a80de6baf8 100644 --- a/arch/arm64/mm/gcs.c +++ b/arch/arm64/mm/gcs.c @@ -109,3 +109,82 @@ void gcs_free(struct task_struct *task) task->thread.gcs_base = 0; task->thread.gcs_size = 0; } + +int arch_set_shadow_stack_status(struct task_struct *task, unsigned long arg) +{ + unsigned long gcs, size; + int ret; + + if (!system_supports_gcs()) + return -EINVAL; + + if (is_compat_thread(task_thread_info(task))) + return -EINVAL; + + /* Reject unknown flags */ + if (arg & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) + return -EINVAL; + + ret = gcs_check_locked(task, arg); + if (ret != 0) + return ret; + + /* If we are enabling GCS then make sure we have a stack */ + if (arg & PR_SHADOW_STACK_ENABLE && + !task_gcs_el0_enabled(task)) { + /* Do not allow GCS to be reenabled */ + if (task->thread.gcs_base || task->thread.gcspr_el0) + return -EINVAL; + + if (task != current) + return -EBUSY; + + size = gcs_size(0); + gcs = alloc_gcs(0, size); + if (!gcs) + return -ENOMEM; + + 
task->thread.gcspr_el0 = gcs + size - sizeof(u64); + task->thread.gcs_base = gcs; + task->thread.gcs_size = size; + if (task == current) + write_sysreg_s(task->thread.gcspr_el0, + SYS_GCSPR_EL0); + } + + task->thread.gcs_el0_mode = arg; + if (task == current) + gcs_set_el0_mode(task); + + return 0; +} + +int arch_get_shadow_stack_status(struct task_struct *task, + unsigned long __user *arg) +{ + if (!system_supports_gcs()) + return -EINVAL; + + if (is_compat_thread(task_thread_info(task))) + return -EINVAL; + + return put_user(task->thread.gcs_el0_mode, arg); +} + +int arch_lock_shadow_stack_status(struct task_struct *task, + unsigned long arg) +{ + if (!system_supports_gcs()) + return -EINVAL; + + if (is_compat_thread(task_thread_info(task))) + return -EINVAL; + + /* + * We support locking unknown bits so applications can prevent + * any changes in a future proof manner. + */ + task->thread.gcs_el0_locked |= arg; + + return 0; +} From 8f3e750673b21ff0613af8b02028200199f3144c Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:03 +0100 Subject: [PATCH 023/168] arm64/mm: Implement map_shadow_stack() As discussed extensively in the changelog for the addition of this syscall on x86 ("x86/shstk: Introduce map_shadow_stack syscall") the existing mmap() and madvise() syscalls do not map entirely well onto the security requirements for guarded control stacks since they lead to windows where memory is allocated but not yet protected or stacks which are not properly and safely initialised. Instead a new syscall map_shadow_stack() has been defined which allocates and initialises a shadow stack page. Implement this for arm64. Two flags are provided, allowing applications to request that the stack be initialised with a valid cap token at the top of the stack and optionally also an end of stack marker above that. We support requesting an end of stack marker alone but since this is a NULL pointer it is indistinguishable from not initialising anything by itself. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Acked-by: Yury Khrustalev Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-24-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/mm/gcs.c | 64 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c index 61a80de6baf8..5c46ec527b1c 100644 --- a/arch/arm64/mm/gcs.c +++ b/arch/arm64/mm/gcs.c @@ -68,6 +68,70 @@ unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, return addr; } +SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags) +{ + unsigned long alloc_size; + unsigned long __user *cap_ptr; + unsigned long cap_val; + int ret = 0; + int cap_offset; + + if (!system_supports_gcs()) + return -EOPNOTSUPP; + + if (flags & ~(SHADOW_STACK_SET_TOKEN | SHADOW_STACK_SET_MARKER)) + return -EINVAL; + + if (!PAGE_ALIGNED(addr)) + return -EINVAL; + + if (size == 8 || !IS_ALIGNED(size, 8)) + return -EINVAL; + + /* + * An overflow would result in attempting to write the restore token + * to the wrong location. Not catastrophic, but just return the right + * error code and block it. + */ + alloc_size = PAGE_ALIGN(size); + if (alloc_size < size) + return -EOVERFLOW; + + addr = alloc_gcs(addr, alloc_size); + if (IS_ERR_VALUE(addr)) + return addr; + + /* + * Put a cap token at the end of the allocated region so it + * can be switched to. 
+ */ + if (flags & SHADOW_STACK_SET_TOKEN) { + /* Leave an extra empty frame as a top of stack marker? */ + if (flags & SHADOW_STACK_SET_MARKER) + cap_offset = 2; + else + cap_offset = 1; + + cap_ptr = (unsigned long __user *)(addr + size - + (cap_offset * sizeof(unsigned long))); + cap_val = GCS_CAP(cap_ptr); + + put_user_gcs(cap_val, cap_ptr, &ret); + if (ret != 0) { + vm_munmap(addr, size); + return -EFAULT; + } + + /* + * Ensure the new cap is ordered before standard + * memory accesses to the same location. + */ + gcsb_dsync(); + } + + return addr; +} + /* * Apply the GCS mode configured for the specified task to the * hardware. From eaf62ce1563b8557e3550acb97d5086120168750 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:04 +0100 Subject: [PATCH 024/168] arm64/signal: Set up and restore the GCS context for signal handlers When invoking a signal handler we use the GCS configuration and stack for the current thread. Since we implement signal return by calling the signal handler with a return address set up pointing to a trampoline in the vDSO we need to also configure any active GCS for this by pushing a frame for the trampoline onto the GCS. If we do not do this then signal return will generate a GCS protection fault. In order to guard against attempts to bypass GCS protections via signal return we only allow returning with GCSPR_EL0 pointing to an address where it was previously preempted by a signal. We do this by pushing a cap onto the GCS, this takes the form of an architectural GCS cap token with the top bit set and token type of 0 which we add on signal entry and validate and pop off on signal return. The combination of the top bit being set and the token type mean that this can't be interpreted as a valid token or address. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-25-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/gcs.h | 1 + arch/arm64/kernel/signal.c | 118 +++++++++++++++++++++++++++++++++-- 2 files changed, 114 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h index 48c97e63e56a..f50660603ecf 100644 --- a/arch/arm64/include/asm/gcs.h +++ b/arch/arm64/include/asm/gcs.h @@ -9,6 +9,7 @@ #include struct kernel_clone_args; +struct ksignal; static inline void gcsb_dsync(void) { diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 561986947530..b5ab0e229a78 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,15 @@ #include #include +#ifdef CONFIG_ARM64_GCS +#define GCS_SIGNAL_CAP(addr) (((unsigned long)addr) & GCS_CAP_ADDR_MASK) + +static bool gcs_signal_cap_valid(u64 addr, u64 val) +{ + return val == GCS_SIGNAL_CAP(addr); +} +#endif + /* * Do a signal return; undo the signal stack. These are aligned to 128-bit. 
*/ @@ -904,6 +914,58 @@ static int restore_sigframe(struct pt_regs *regs, return err; } +#ifdef CONFIG_ARM64_GCS +static int gcs_restore_signal(void) +{ + unsigned long __user *gcspr_el0; + u64 cap; + int ret; + + if (!system_supports_gcs()) + return 0; + + if (!(current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE)) + return 0; + + gcspr_el0 = (unsigned long __user *)read_sysreg_s(SYS_GCSPR_EL0); + + /* + * Ensure that any changes to the GCS done via GCS operations + * are visible to the normal reads we do to validate the + * token. + */ + gcsb_dsync(); + + /* + * GCSPR_EL0 should be pointing at a capped GCS, read the cap. + * We don't enforce that this is in a GCS page, if it is not + * then faults will be generated on GCS operations - the main + * concern is to protect GCS pages. + */ + ret = copy_from_user(&cap, gcspr_el0, sizeof(cap)); + if (ret) + return -EFAULT; + + /* + * Check that the cap is the actual GCS before replacing it. + */ + if (!gcs_signal_cap_valid((u64)gcspr_el0, cap)) + return -EINVAL; + + /* Invalidate the token to prevent reuse */ + put_user_gcs(0, (__user void*)gcspr_el0, &ret); + if (ret != 0) + return -EFAULT; + + write_sysreg_s(gcspr_el0 + 1, SYS_GCSPR_EL0); + + return 0; +} + +#else +static int gcs_restore_signal(void) { return 0; } +#endif + SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); @@ -927,6 +989,9 @@ SYSCALL_DEFINE0(rt_sigreturn) if (restore_sigframe(regs, frame)) goto badframe; + if (gcs_restore_signal()) + goto badframe; + if (restore_altstack(&frame->uc.uc_stack)) goto badframe; @@ -1189,7 +1254,48 @@ static int get_sigframe(struct rt_sigframe_user_layout *user, return 0; } -static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, +#ifdef CONFIG_ARM64_GCS + +static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) +{ + unsigned long __user *gcspr_el0; + int ret = 0; + + if (!system_supports_gcs()) + return 0; + + if (!task_gcs_el0_enabled(current)) + return 0; + + /* + * We are entering a signal handler, current register state is + * active. + */ + gcspr_el0 = (unsigned long __user *)read_sysreg_s(SYS_GCSPR_EL0); + + /* + * Push a cap and the GCS entry for the trampoline onto the GCS. 
+ */ + put_user_gcs((unsigned long)sigtramp, gcspr_el0 - 2, &ret); + put_user_gcs(GCS_SIGNAL_CAP(gcspr_el0 - 1), gcspr_el0 - 1, &ret); + if (ret != 0) + return ret; + + gcspr_el0 -= 2; + write_sysreg_s((unsigned long)gcspr_el0, SYS_GCSPR_EL0); + + return 0; +} +#else + +static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) +{ + return 0; +} + +#endif + +static int setup_return(struct pt_regs *regs, struct ksignal *ksig, struct rt_sigframe_user_layout *user, int usig) { __sigrestore_t sigtramp; @@ -1197,7 +1303,7 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, regs->regs[0] = usig; regs->sp = (unsigned long)user->sigframe; regs->regs[29] = (unsigned long)&user->next_frame->fp; - regs->pc = (unsigned long)ka->sa.sa_handler; + regs->pc = (unsigned long)ksig->ka.sa.sa_handler; /* * Signal delivery is a (wacky) indirect function call in @@ -1240,12 +1346,14 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, if (system_supports_poe()) write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); - if (ka->sa.sa_flags & SA_RESTORER) - sigtramp = ka->sa.sa_restorer; + if (ksig->ka.sa.sa_flags & SA_RESTORER) + sigtramp = ksig->ka.sa.sa_restorer; else sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp); regs->regs[30] = (unsigned long)sigtramp; + + return gcs_signal_entry(sigtramp, ksig); } static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, @@ -1268,7 +1376,7 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, err |= __save_altstack(&frame->uc.uc_stack, regs->sp); err |= setup_sigframe(&user, regs, set); if (err == 0) { - setup_return(regs, &ksig->ka, &user, usig); + err = setup_return(regs, ksig, &user, usig); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { err |= copy_siginfo_to_user(&frame->info, &ksig->info); regs->regs[1] = (unsigned long)&frame->info; From 16f47bb9ac8afe09e7ca14cc53748f779b2a12e0 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:05 +0100 Subject: [PATCH 025/168] arm64/signal: Expose GCS state in signal frames Add a context for the GCS state and include it in the signal context when running on a system that supports GCS. We reuse the same flags that the prctl() uses to specify which GCS features are enabled and also provide the current GCS pointer. We do not support enabling GCS via signal return, there is a conflict between specifying GCSPR_EL0 and allocation of a new GCS and this is not an ancticipated use case. We also enforce GCS configuration locking on signal return. 
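For illustration, a signal handler can locate the new record by walking the _aarch64_ctx headers in the signal frame in the usual way. The sketch below ignores EXTRA_MAGIC handling and defines the structure locally in case installed UAPI headers predate this patch; printf() in a signal handler is for illustration only:

#include <signal.h>
#include <stdio.h>
#include <ucontext.h>
#include <asm/sigcontext.h>

#ifndef GCS_MAGIC
#define GCS_MAGIC 0x47435300

struct gcs_context {
	struct _aarch64_ctx head;
	__u64 gcspr;
	__u64 features_enabled;
	__u64 reserved;
};
#endif

static void handler(int sig, siginfo_t *si, void *uc_)
{
	ucontext_t *uc = uc_;
	struct _aarch64_ctx *head =
		(struct _aarch64_ctx *)uc->uc_mcontext.__reserved;

	/* Records are terminated by a header with magic == 0 */
	for (; head->magic; head = (void *)((char *)head + head->size)) {
		if (head->magic == GCS_MAGIC) {
			struct gcs_context *gcs = (void *)head;

			printf("GCSPR: 0x%llx features: 0x%llx\n",
			       (unsigned long long)gcs->gcspr,
			       (unsigned long long)gcs->features_enabled);
			return;
		}
	}
}

int main(void)
{
	struct sigaction sa = { .sa_sigaction = handler, .sa_flags = SA_SIGINFO };

	sigaction(SIGUSR1, &sa, NULL);
	raise(SIGUSR1);
	return 0;
}
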
Reviewed-by: Catalin Marinas Reviewed-by: Thiago Jung Bauermann Acked-by: Yury Khrustalev Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-26-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/uapi/asm/sigcontext.h | 9 ++ arch/arm64/kernel/signal.c | 109 +++++++++++++++++++++++ 2 files changed, 118 insertions(+) diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h index bb7af77a30a7..d42f7a92238b 100644 --- a/arch/arm64/include/uapi/asm/sigcontext.h +++ b/arch/arm64/include/uapi/asm/sigcontext.h @@ -183,6 +183,15 @@ struct zt_context { __u16 __reserved[3]; }; +#define GCS_MAGIC 0x47435300 + +struct gcs_context { + struct _aarch64_ctx head; + __u64 gcspr; + __u64 features_enabled; + __u64 reserved; +}; + #endif /* !__ASSEMBLY__ */ #include diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index b5ab0e229a78..62d666278264 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -66,6 +66,7 @@ struct rt_sigframe_user_layout { unsigned long fpsimd_offset; unsigned long esr_offset; + unsigned long gcs_offset; unsigned long sve_offset; unsigned long tpidr2_offset; unsigned long za_offset; @@ -198,6 +199,8 @@ struct user_ctxs { u32 fpmr_size; struct poe_context __user *poe; u32 poe_size; + struct gcs_context __user *gcs; + u32 gcs_size; }; static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) @@ -643,6 +646,82 @@ extern int restore_zt_context(struct user_ctxs *user); #endif /* ! CONFIG_ARM64_SME */ +#ifdef CONFIG_ARM64_GCS + +static int preserve_gcs_context(struct gcs_context __user *ctx) +{ + int err = 0; + u64 gcspr = read_sysreg_s(SYS_GCSPR_EL0); + + /* + * If GCS is enabled we will add a cap token to the frame, + * include it in the GCSPR_EL0 we report to support stack + * switching via sigreturn if GCS is enabled. We do not allow + * enabling via sigreturn so the token is only relevant for + * threads with GCS enabled. + */ + if (task_gcs_el0_enabled(current)) + gcspr -= 8; + + __put_user_error(GCS_MAGIC, &ctx->head.magic, err); + __put_user_error(sizeof(*ctx), &ctx->head.size, err); + __put_user_error(gcspr, &ctx->gcspr, err); + __put_user_error(0, &ctx->reserved, err); + __put_user_error(current->thread.gcs_el0_mode, + &ctx->features_enabled, err); + + return err; +} + +static int restore_gcs_context(struct user_ctxs *user) +{ + u64 gcspr, enabled; + int err = 0; + + if (user->gcs_size != sizeof(*user->gcs)) + return -EINVAL; + + __get_user_error(gcspr, &user->gcs->gcspr, err); + __get_user_error(enabled, &user->gcs->features_enabled, err); + if (err) + return err; + + /* Don't allow unknown modes */ + if (enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) + return -EINVAL; + + err = gcs_check_locked(current, enabled); + if (err != 0) + return err; + + /* Don't allow enabling */ + if (!task_gcs_el0_enabled(current) && + (enabled & PR_SHADOW_STACK_ENABLE)) + return -EINVAL; + + /* If we are disabling disable everything */ + if (!(enabled & PR_SHADOW_STACK_ENABLE)) + enabled = 0; + + current->thread.gcs_el0_mode = enabled; + + /* + * We let userspace set GCSPR_EL0 to anything here, we will + * validate later in gcs_restore_signal(). + */ + write_sysreg_s(gcspr, SYS_GCSPR_EL0); + + return 0; +} + +#else /* ! CONFIG_ARM64_GCS */ + +/* Turn any non-optimised out attempts to use these into a link error: */ +extern int preserve_gcs_context(void __user *ctx); +extern int restore_gcs_context(struct user_ctxs *user); + +#endif /* ! 
CONFIG_ARM64_GCS */ + static int parse_user_sigframe(struct user_ctxs *user, struct rt_sigframe __user *sf) { @@ -661,6 +740,7 @@ static int parse_user_sigframe(struct user_ctxs *user, user->zt = NULL; user->fpmr = NULL; user->poe = NULL; + user->gcs = NULL; if (!IS_ALIGNED((unsigned long)base, 16)) goto invalid; @@ -777,6 +857,17 @@ static int parse_user_sigframe(struct user_ctxs *user, user->fpmr_size = size; break; + case GCS_MAGIC: + if (!system_supports_gcs()) + goto invalid; + + if (user->gcs) + goto invalid; + + user->gcs = (struct gcs_context __user *)head; + user->gcs_size = size; + break; + case EXTRA_MAGIC: if (have_extra_context) goto invalid; @@ -896,6 +987,9 @@ static int restore_sigframe(struct pt_regs *regs, err = restore_fpsimd_context(&user); } + if (err == 0 && system_supports_gcs() && user.gcs) + err = restore_gcs_context(&user); + if (err == 0 && system_supports_tpidr2() && user.tpidr2) err = restore_tpidr2_context(&user); @@ -1029,6 +1123,15 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, return err; } +#ifdef CONFIG_ARM64_GCS + if (system_supports_gcs() && (add_all || current->thread.gcspr_el0)) { + err = sigframe_alloc(user, &user->gcs_offset, + sizeof(struct gcs_context)); + if (err) + return err; + } +#endif + if (system_supports_sve() || system_supports_sme()) { unsigned int vq = 0; @@ -1136,6 +1239,12 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, __put_user_error(current->thread.fault_code, &esr_ctx->esr, err); } + if (system_supports_gcs() && err == 0 && user->gcs_offset) { + struct gcs_context __user *gcs_ctx = + apply_user_offset(user, user->gcs_offset); + err |= preserve_gcs_context(gcs_ctx); + } + /* Scalable Vector Extension state (including streaming), if present */ if ((system_supports_sve() || system_supports_sme()) && err == 0 && user->sve_offset) { From 7ec3b57cb29f8371bf12a725b6e8f75831a03f27 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:06 +0100 Subject: [PATCH 026/168] arm64/ptrace: Expose GCS via ptrace and core files Provide a new register type NT_ARM_GCS reporting the current GCS mode and pointer for EL0. Due to the interactions with allocation and deallocation of Guarded Control Stacks we do not permit any changes to the GCS mode via ptrace, only GCSPR_EL0 may be changed. 
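For example, a debugger might read the new regset along these lines (a sketch only: NT_ARM_GCS and the regset layout are as defined in this patch, the structure is repeated locally in case installed UAPI headers predate it, and the tracee must already be ptrace-attached and stopped):

#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <linux/types.h>

#ifndef NT_ARM_GCS
#define NT_ARM_GCS 0x410
#endif

/* Matches the NT_ARM_GCS regset layout added by this patch */
struct user_gcs {
	__u64 features_enabled;
	__u64 features_locked;
	__u64 gcspr_el0;
};

static long read_gcs_regset(pid_t pid, struct user_gcs *regs)
{
	struct iovec iov = {
		.iov_base = regs,
		.iov_len = sizeof(*regs),
	};

	return ptrace(PTRACE_GETREGSET, pid,
		      (void *)(unsigned long)NT_ARM_GCS, &iov);
}
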
Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-27-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/uapi/asm/ptrace.h | 8 ++++ arch/arm64/kernel/ptrace.c | 62 +++++++++++++++++++++++++++- include/uapi/linux/elf.h | 1 + 3 files changed, 70 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index 7fa2f7036aa7..0f39ba4f3efd 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -324,6 +324,14 @@ struct user_za_header { #define ZA_PT_SIZE(vq) \ (ZA_PT_ZA_OFFSET + ZA_PT_ZA_SIZE(vq)) +/* GCS state (NT_ARM_GCS) */ + +struct user_gcs { + __u64 features_enabled; + __u64 features_locked; + __u64 gcspr_el0; +}; + #endif /* __ASSEMBLY__ */ #endif /* _UAPI__ASM_PTRACE_H */ diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index b756578aeaee..6c1dcfe6d25a 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -1473,6 +1474,52 @@ static int poe_set(struct task_struct *target, const struct } #endif +#ifdef CONFIG_ARM64_GCS +static int gcs_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct user_gcs user_gcs; + + if (!system_supports_gcs()) + return -EINVAL; + + if (target == current) + gcs_preserve_current_state(); + + user_gcs.features_enabled = target->thread.gcs_el0_mode; + user_gcs.features_locked = target->thread.gcs_el0_locked; + user_gcs.gcspr_el0 = target->thread.gcspr_el0; + + return membuf_write(&to, &user_gcs, sizeof(user_gcs)); +} + +static int gcs_set(struct task_struct *target, const struct + user_regset *regset, unsigned int pos, + unsigned int count, const void *kbuf, const + void __user *ubuf) +{ + int ret; + struct user_gcs user_gcs; + + if (!system_supports_gcs()) + return -EINVAL; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_gcs, 0, -1); + if (ret) + return ret; + + if (user_gcs.features_enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) + return -EINVAL; + + target->thread.gcs_el0_mode = user_gcs.features_enabled; + target->thread.gcs_el0_locked = user_gcs.features_locked; + target->thread.gcspr_el0 = user_gcs.gcspr_el0; + + return 0; +} +#endif + enum aarch64_regset { REGSET_GPR, REGSET_FPR, @@ -1503,7 +1550,10 @@ enum aarch64_regset { REGSET_TAGGED_ADDR_CTRL, #endif #ifdef CONFIG_ARM64_POE - REGSET_POE + REGSET_POE, +#endif +#ifdef CONFIG_ARM64_GCS + REGSET_GCS, #endif }; @@ -1674,6 +1724,16 @@ static const struct user_regset aarch64_regsets[] = { .set = poe_set, }, #endif +#ifdef CONFIG_ARM64_GCS + [REGSET_GCS] = { + .core_note_type = NT_ARM_GCS, + .n = sizeof(struct user_gcs) / sizeof(u64), + .size = sizeof(u64), + .align = sizeof(u64), + .regset_get = gcs_get, + .set = gcs_set, + }, +#endif }; static const struct user_regset_view user_aarch64_view = { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index b9935988da5c..9adc218fb6df 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -443,6 +443,7 @@ typedef struct elf64_shdr { #define NT_ARM_ZT 0x40d /* ARM SME ZT registers */ #define NT_ARM_FPMR 0x40e /* ARM floating point mode register */ #define NT_ARM_POE 0x40f /* ARM POE registers */ +#define NT_ARM_GCS 0x410 /* ARM GCS state */ #define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ #define NT_VMCOREDD 0x700 /* Vmcore 
Device Dump Note */ #define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ From 5d8b172e7005c6b42c16a0952c1d8873051d68ae Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:07 +0100 Subject: [PATCH 027/168] arm64: Add Kconfig for Guarded Control Stack (GCS) Provide a Kconfig option allowing the user to select if GCS support is built into the kernel. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-28-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3e29b44d2d7b..dcb12f041c13 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2178,6 +2178,27 @@ config ARCH_PKEY_BITS endmenu # "ARMv8.9 architectural features" +menu "v9.4 architectural features" + +config ARM64_GCS + bool "Enable support for Guarded Control Stack (GCS)" + default y + select ARCH_HAS_USER_SHADOW_STACK + select ARCH_USES_HIGH_VMA_FLAGS + depends on !UPROBES + help + Guarded Control Stack (GCS) provides support for a separate + stack with restricted access which contains only return + addresses. This can be used to harden against some attacks + by comparing return address used by the program with what is + stored in the GCS, and may also be used to efficiently obtain + the call stack for applications such as profiling. + + The feature is detected at runtime, and will remain disabled + if the system does not implement the feature. + +endmenu # "v9.4 architectural features" + config ARM64_SVE bool "ARM Scalable Vector Extension support" default y From 7a2f671db61f32de0671eeb163a7764e5a258114 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:08 +0100 Subject: [PATCH 028/168] kselftest/arm64: Verify the GCS hwcap Add coverage of the GCS hwcap to the hwcap selftest, using a read of GCSPR_EL0 to generate SIGILL without having to worry about enabling GCS. Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-29-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/abi/hwcap.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index f2d6007a2b98..1f07772ae578 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -98,6 +98,17 @@ static void fpmr_sigill(void) asm volatile("mrs x0, S3_3_C4_C4_2" : : : "x0"); } +static void gcs_sigill(void) +{ + unsigned long *gcspr; + + asm volatile( + "mrs %0, S3_3_C2_C5_1" + : "=r" (gcspr) + : + : "cc"); +} + static void ilrcpc_sigill(void) { /* LDAPUR W0, [SP, #8] */ @@ -534,6 +545,14 @@ static const struct hwcap_data { .sigill_fn = fpmr_sigill, .sigill_reliable = true, }, + { + .name = "GCS", + .at_hwcap = AT_HWCAP, + .hwcap_bit = HWCAP_GCS, + .cpuinfo = "gcs", + .sigill_fn = gcs_sigill, + .sigill_reliable = true, + }, { .name = "JSCVT", .at_hwcap = AT_HWCAP, From b2d2f11ff5d69cd4b3585ddab4bec9f69503f680 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:09 +0100 Subject: [PATCH 029/168] kselftest/arm64: Add GCS as a detected feature in the signal tests In preparation for testing GCS related signal handling add it as a feature we check for in the signal handling support code. 
Reviewed-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-30-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/signal/test_signals.h | 2 ++ tools/testing/selftests/arm64/signal/test_signals_utils.c | 3 +++ 2 files changed, 5 insertions(+) diff --git a/tools/testing/selftests/arm64/signal/test_signals.h b/tools/testing/selftests/arm64/signal/test_signals.h index 1e6273d81575..7ada43688c02 100644 --- a/tools/testing/selftests/arm64/signal/test_signals.h +++ b/tools/testing/selftests/arm64/signal/test_signals.h @@ -35,6 +35,7 @@ enum { FSME_BIT, FSME_FA64_BIT, FSME2_BIT, + FGCS_BIT, FMAX_END }; @@ -43,6 +44,7 @@ enum { #define FEAT_SME (1UL << FSME_BIT) #define FEAT_SME_FA64 (1UL << FSME_FA64_BIT) #define FEAT_SME2 (1UL << FSME2_BIT) +#define FEAT_GCS (1UL << FGCS_BIT) /* * A descriptor used to describe and configure a test case. diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.c b/tools/testing/selftests/arm64/signal/test_signals_utils.c index 0dc948db3a4a..dcc49e3ce1eb 100644 --- a/tools/testing/selftests/arm64/signal/test_signals_utils.c +++ b/tools/testing/selftests/arm64/signal/test_signals_utils.c @@ -30,6 +30,7 @@ static char const *const feats_names[FMAX_END] = { " SME ", " FA64 ", " SME2 ", + " GCS ", }; #define MAX_FEATS_SZ 128 @@ -329,6 +330,8 @@ int test_init(struct tdescr *td) td->feats_supported |= FEAT_SME_FA64; if (getauxval(AT_HWCAP2) & HWCAP2_SME2) td->feats_supported |= FEAT_SME2; + if (getauxval(AT_HWCAP) & HWCAP_GCS) + td->feats_supported |= FEAT_GCS; if (feats_ok(td)) { if (td->feats_required & td->feats_supported) fprintf(stderr, From 0d426f7dd9a0d88aa39c1dd54a6bf10f0466c6b9 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:10 +0100 Subject: [PATCH 030/168] kselftest/arm64: Add framework support for GCS to signal handling tests Teach the framework about the GCS signal context, avoiding warnings on the unknown context. 
Reviewed-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-31-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/signal/testcases/testcases.c | 7 +++++++ tools/testing/selftests/arm64/signal/testcases/testcases.h | 1 + 2 files changed, 8 insertions(+) diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.c b/tools/testing/selftests/arm64/signal/testcases/testcases.c index e6daa94fcd2e..0c1a6b26afac 100644 --- a/tools/testing/selftests/arm64/signal/testcases/testcases.c +++ b/tools/testing/selftests/arm64/signal/testcases/testcases.c @@ -198,6 +198,13 @@ bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err) *err = "Bad size for fpmr_context"; new_flags |= FPMR_CTX; break; + case GCS_MAGIC: + if (flags & GCS_CTX) + *err = "Multiple GCS_MAGIC"; + if (head->size != sizeof(struct gcs_context)) + *err = "Bad size for gcs_context"; + new_flags |= GCS_CTX; + break; case EXTRA_MAGIC: if (flags & EXTRA_CTX) *err = "Multiple EXTRA_MAGIC"; diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.h b/tools/testing/selftests/arm64/signal/testcases/testcases.h index 9872b8912714..98b97efdda23 100644 --- a/tools/testing/selftests/arm64/signal/testcases/testcases.h +++ b/tools/testing/selftests/arm64/signal/testcases/testcases.h @@ -20,6 +20,7 @@ #define EXTRA_CTX (1 << 3) #define ZT_CTX (1 << 4) #define FPMR_CTX (1 << 5) +#define GCS_CTX (1 << 6) #define KSFT_BAD_MAGIC 0xdeadbeef From 956573ac189066a32326245ebf5abf35b64a490f Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:11 +0100 Subject: [PATCH 031/168] kselftest/arm64: Allow signals tests to specify an expected si_code Currently we ignore si_code unless the expected signal is a SIGSEGV, in which case we enforce it being SEGV_ACCERR. Allow test cases to specify exactly which si_code should be generated so we can validate this, and test for other segfault codes. Reviewed-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-32-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- .../selftests/arm64/signal/test_signals.h | 4 +++ .../arm64/signal/test_signals_utils.c | 29 ++++++++++++------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/arm64/signal/test_signals.h b/tools/testing/selftests/arm64/signal/test_signals.h index 7ada43688c02..ee75a2c25ce7 100644 --- a/tools/testing/selftests/arm64/signal/test_signals.h +++ b/tools/testing/selftests/arm64/signal/test_signals.h @@ -71,6 +71,10 @@ struct tdescr { * Zero when no signal is expected on success */ int sig_ok; + /* + * expected si_code for sig_ok, or 0 to not check + */ + int sig_ok_code; /* signum expected on unsupported CPU features. */ int sig_unsupp; /* a timeout in second for test completion */ diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.c b/tools/testing/selftests/arm64/signal/test_signals_utils.c index dcc49e3ce1eb..5d3621921cfe 100644 --- a/tools/testing/selftests/arm64/signal/test_signals_utils.c +++ b/tools/testing/selftests/arm64/signal/test_signals_utils.c @@ -143,16 +143,25 @@ static bool handle_signal_ok(struct tdescr *td, "current->token ZEROED...test is probably broken!\n"); abort(); } - /* - * Trying to narrow down the SEGV to the ones generated by Kernel itself - * via arm64_notify_segfault(). 
This is a best-effort check anyway, and - * the si_code check may need to change if this aspect of the kernel - * ABI changes. - */ - if (td->sig_ok == SIGSEGV && si->si_code != SEGV_ACCERR) { - fprintf(stdout, - "si_code != SEGV_ACCERR...test is probably broken!\n"); - abort(); + if (td->sig_ok_code) { + if (si->si_code != td->sig_ok_code) { + fprintf(stdout, "si_code is %d not %d\n", + si->si_code, td->sig_ok_code); + abort(); + } + } else { + /* + * Trying to narrow down the SEGV to the ones + * generated by Kernel itself via + * arm64_notify_segfault(). This is a best-effort + * check anyway, and the si_code check may need to + * change if this aspect of the kernel ABI changes. + */ + if (td->sig_ok == SIGSEGV && si->si_code != SEGV_ACCERR) { + fprintf(stdout, + "si_code != SEGV_ACCERR...test is probably broken!\n"); + abort(); + } } td->pass = 1; /* From 42155a8eb0f63f634a98ad17a85e9f2826bcff11 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:12 +0100 Subject: [PATCH 032/168] kselftest/arm64: Always run signals tests with GCS enabled Since it is not possible to return from the function that enabled GCS without disabling GCS it is very inconvenient to use the signal handling tests to cover GCS when GCS is not enabled by the toolchain and runtime, something that no current distribution does. Since none of the testcases do anything with stacks that would cause problems with GCS we can sidestep this issue by unconditionally enabling GCS on startup and exiting with a call to exit() rather than a return from main(). Reviewed-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-33-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- .../selftests/arm64/signal/test_signals.c | 17 ++++++++++- .../arm64/signal/test_signals_utils.h | 29 +++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/signal/test_signals.c b/tools/testing/selftests/arm64/signal/test_signals.c index 00051b40d71e..1304c8ec0f2f 100644 --- a/tools/testing/selftests/arm64/signal/test_signals.c +++ b/tools/testing/selftests/arm64/signal/test_signals.c @@ -7,6 +7,10 @@ * Each test provides its own tde struct tdescr descriptor to link with * this wrapper. Framework provides common helpers. */ + +#include +#include + #include #include "test_signals.h" @@ -16,6 +20,16 @@ struct tdescr *current = &tde; int main(int argc, char *argv[]) { + /* + * Ensure GCS is at least enabled throughout the tests if + * supported, otherwise the inability to return from the + * function that enabled GCS makes it very inconvenient to set + * up test cases. The prctl() may fail if GCS was locked by + * libc setup code. 
+ */ + if (getauxval(AT_HWCAP) & HWCAP_GCS) + gcs_set_state(PR_SHADOW_STACK_ENABLE); + ksft_print_msg("%s :: %s\n", current->name, current->descr); if (test_setup(current) && test_init(current)) { test_run(current); @@ -23,5 +37,6 @@ int main(int argc, char *argv[]) } test_result(current); - return current->result; + /* Do not return in case GCS was enabled */ + exit(current->result); } diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.h b/tools/testing/selftests/arm64/signal/test_signals_utils.h index 762c8fe9c54a..1e80808ee105 100644 --- a/tools/testing/selftests/arm64/signal/test_signals_utils.h +++ b/tools/testing/selftests/arm64/signal/test_signals_utils.h @@ -18,6 +18,35 @@ void test_cleanup(struct tdescr *td); int test_run(struct tdescr *td); void test_result(struct tdescr *td); +#ifndef __NR_prctl +#define __NR_prctl 167 +#endif + +/* + * The prctl takes 1 argument but we need to ensure that the other + * values passed in registers to the syscall are zero since the kernel + * validates them. + */ +#define gcs_set_state(state) \ + ({ \ + register long _num __asm__ ("x8") = __NR_prctl; \ + register long _arg1 __asm__ ("x0") = PR_SET_SHADOW_STACK_STATUS; \ + register long _arg2 __asm__ ("x1") = (long)(state); \ + register long _arg3 __asm__ ("x2") = 0; \ + register long _arg4 __asm__ ("x3") = 0; \ + register long _arg5 __asm__ ("x4") = 0; \ + \ + __asm__ volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), \ + "r"(_arg3), "r"(_arg4), \ + "r"(_arg5), "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ + }) + static inline bool feats_ok(struct tdescr *td) { if (td->feats_incompatible & td->feats_supported) From 3d37d4307e0fc958c4461bb6973ce5573d1570c2 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:13 +0100 Subject: [PATCH 033/168] kselftest/arm64: Add very basic GCS test program This test program just covers the basic GCS ABI, covering aspects of the ABI as standalone features without attempting to integrate things. 
Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-34-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/Makefile | 2 +- tools/testing/selftests/arm64/gcs/.gitignore | 1 + tools/testing/selftests/arm64/gcs/Makefile | 18 + tools/testing/selftests/arm64/gcs/basic-gcs.c | 357 ++++++++++++++++++ tools/testing/selftests/arm64/gcs/gcs-util.h | 90 +++++ 5 files changed, 467 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/arm64/gcs/.gitignore create mode 100644 tools/testing/selftests/arm64/gcs/Makefile create mode 100644 tools/testing/selftests/arm64/gcs/basic-gcs.c create mode 100644 tools/testing/selftests/arm64/gcs/gcs-util.h diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile index 28b93cab8c0d..22029e60eff3 100644 --- a/tools/testing/selftests/arm64/Makefile +++ b/tools/testing/selftests/arm64/Makefile @@ -4,7 +4,7 @@ ARCH ?= $(shell uname -m 2>/dev/null || echo not) ifneq (,$(filter $(ARCH),aarch64 arm64)) -ARM64_SUBTARGETS ?= tags signal pauth fp mte bti abi +ARM64_SUBTARGETS ?= tags signal pauth fp mte bti abi gcs else ARM64_SUBTARGETS := endif diff --git a/tools/testing/selftests/arm64/gcs/.gitignore b/tools/testing/selftests/arm64/gcs/.gitignore new file mode 100644 index 000000000000..0e5e695ecba5 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/.gitignore @@ -0,0 +1 @@ +basic-gcs diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile new file mode 100644 index 000000000000..61a30f483429 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/Makefile @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2023 ARM Limited +# +# In order to avoid interaction with the toolchain and dynamic linker the +# portions of these tests that interact with the GCS are implemented using +# nolibc. +# + +TEST_GEN_PROGS := basic-gcs + +include ../../lib.mk + +$(OUTPUT)/basic-gcs: basic-gcs.c + $(CC) -g -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \ + -static -include ../../../../include/nolibc/nolibc.h \ + -I../../../../../usr/include \ + -std=gnu99 -I../.. -g \ + -ffreestanding -Wall $^ -o $@ -lgcc diff --git a/tools/testing/selftests/arm64/gcs/basic-gcs.c b/tools/testing/selftests/arm64/gcs/basic-gcs.c new file mode 100644 index 000000000000..3fb9742342a3 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/basic-gcs.c @@ -0,0 +1,357 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Limited. + */ + +#include +#include + +#include + +#include +#include +#include + +#include "kselftest.h" +#include "gcs-util.h" + +/* nolibc doesn't have sysconf(), just hard code the maximum */ +static size_t page_size = 65536; + +static __attribute__((noinline)) void valid_gcs_function(void) +{ + /* Do something the compiler can't optimise out */ + my_syscall1(__NR_prctl, PR_SVE_GET_VL); +} + +static inline int gcs_set_status(unsigned long mode) +{ + bool enabling = mode & PR_SHADOW_STACK_ENABLE; + int ret; + unsigned long new_mode; + + /* + * The prctl takes 1 argument but we need to ensure that the + * other 3 values passed in registers to the syscall are zero + * since the kernel validates them. 
+ */ + ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, mode, + 0, 0, 0); + + if (ret == 0) { + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, + &new_mode, 0, 0, 0); + if (ret == 0) { + if (new_mode != mode) { + ksft_print_msg("Mode set to %lx not %lx\n", + new_mode, mode); + ret = -EINVAL; + } + } else { + ksft_print_msg("Failed to validate mode: %d\n", ret); + } + + if (enabling != chkfeat_gcs()) { + ksft_print_msg("%senabled by prctl but %senabled in CHKFEAT\n", + enabling ? "" : "not ", + chkfeat_gcs() ? "" : "not "); + ret = -EINVAL; + } + } + + return ret; +} + +/* Try to read the status */ +static bool read_status(void) +{ + unsigned long state; + int ret; + + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, + &state, 0, 0, 0); + if (ret != 0) { + ksft_print_msg("Failed to read state: %d\n", ret); + return false; + } + + return state & PR_SHADOW_STACK_ENABLE; +} + +/* Just a straight enable */ +static bool base_enable(void) +{ + int ret; + + ret = gcs_set_status(PR_SHADOW_STACK_ENABLE); + if (ret) { + ksft_print_msg("PR_SHADOW_STACK_ENABLE failed %d\n", ret); + return false; + } + + return true; +} + +/* Check we can read GCSPR_EL0 when GCS is enabled */ +static bool read_gcspr_el0(void) +{ + unsigned long *gcspr_el0; + + ksft_print_msg("GET GCSPR\n"); + gcspr_el0 = get_gcspr(); + ksft_print_msg("GCSPR_EL0 is %p\n", gcspr_el0); + + return true; +} + +/* Also allow writes to stack */ +static bool enable_writeable(void) +{ + int ret; + + ret = gcs_set_status(PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE); + if (ret) { + ksft_print_msg("PR_SHADOW_STACK_ENABLE writeable failed: %d\n", ret); + return false; + } + + ret = gcs_set_status(PR_SHADOW_STACK_ENABLE); + if (ret) { + ksft_print_msg("failed to restore plain enable %d\n", ret); + return false; + } + + return true; +} + +/* Also allow writes to stack */ +static bool enable_push_pop(void) +{ + int ret; + + ret = gcs_set_status(PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_PUSH); + if (ret) { + ksft_print_msg("PR_SHADOW_STACK_ENABLE with push failed: %d\n", + ret); + return false; + } + + ret = gcs_set_status(PR_SHADOW_STACK_ENABLE); + if (ret) { + ksft_print_msg("failed to restore plain enable %d\n", ret); + return false; + } + + return true; +} + +/* Enable GCS and allow everything */ +static bool enable_all(void) +{ + int ret; + + ret = gcs_set_status(PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_PUSH | + PR_SHADOW_STACK_WRITE); + if (ret) { + ksft_print_msg("PR_SHADOW_STACK_ENABLE with everything failed: %d\n", + ret); + return false; + } + + ret = gcs_set_status(PR_SHADOW_STACK_ENABLE); + if (ret) { + ksft_print_msg("failed to restore plain enable %d\n", ret); + return false; + } + + return true; +} + +static bool enable_invalid(void) +{ + int ret = gcs_set_status(ULONG_MAX); + if (ret == 0) { + ksft_print_msg("GCS_SET_STATUS %lx succeeded\n", ULONG_MAX); + return false; + } + + return true; +} + +/* Map a GCS */ +static bool map_guarded_stack(void) +{ + int ret; + uint64_t *buf; + uint64_t expected_cap; + int elem; + bool pass = true; + + buf = (void *)my_syscall3(__NR_map_shadow_stack, 0, page_size, + SHADOW_STACK_SET_MARKER | + SHADOW_STACK_SET_TOKEN); + if (buf == MAP_FAILED) { + ksft_print_msg("Failed to map %lu byte GCS: %d\n", + page_size, errno); + return false; + } + ksft_print_msg("Mapped GCS at %p-%p\n", buf, + (void *)((uint64_t)buf + page_size)); + + /* The top of the newly allocated region should be 0 */ + elem = (page_size / sizeof(uint64_t)) - 1; + if (buf[elem]) { + ksft_print_msg("Last entry 
is 0x%llx not 0x0\n", buf[elem]); + pass = false; + } + + /* Then a valid cap token */ + elem--; + expected_cap = ((uint64_t)buf + page_size - 16); + expected_cap &= GCS_CAP_ADDR_MASK; + expected_cap |= GCS_CAP_VALID_TOKEN; + if (buf[elem] != expected_cap) { + ksft_print_msg("Cap entry is 0x%llx not 0x%llx\n", + buf[elem], expected_cap); + pass = false; + } + ksft_print_msg("cap token is 0x%llx\n", buf[elem]); + + /* The rest should be zeros */ + for (elem = 0; elem < page_size / sizeof(uint64_t) - 2; elem++) { + if (!buf[elem]) + continue; + ksft_print_msg("GCS slot %d is 0x%llx not 0x0\n", + elem, buf[elem]); + pass = false; + } + + ret = munmap(buf, page_size); + if (ret != 0) { + ksft_print_msg("Failed to unmap %ld byte GCS: %d\n", + page_size, errno); + pass = false; + } + + return pass; +} + +/* A fork()ed process can run */ +static bool test_fork(void) +{ + unsigned long child_mode; + int ret, status; + pid_t pid; + bool pass = true; + + pid = fork(); + if (pid == -1) { + ksft_print_msg("fork() failed: %d\n", errno); + pass = false; + goto out; + } + if (pid == 0) { + /* In child, make sure we can call a function, read + * the GCS pointer and status and then exit */ + valid_gcs_function(); + get_gcspr(); + + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, + &child_mode, 0, 0, 0); + if (ret == 0 && !(child_mode & PR_SHADOW_STACK_ENABLE)) { + ksft_print_msg("GCS not enabled in child\n"); + ret = -EINVAL; + } + + exit(ret); + } + + /* + * In parent, check we can still do function calls then block + * for the child. + */ + valid_gcs_function(); + + ksft_print_msg("Waiting for child %d\n", pid); + + ret = waitpid(pid, &status, 0); + if (ret == -1) { + ksft_print_msg("Failed to wait for child: %d\n", + errno); + return false; + } + + if (!WIFEXITED(status)) { + ksft_print_msg("Child exited due to signal %d\n", + WTERMSIG(status)); + pass = false; + } else { + if (WEXITSTATUS(status)) { + ksft_print_msg("Child exited with status %d\n", + WEXITSTATUS(status)); + pass = false; + } + } + +out: + + return pass; +} + +typedef bool (*gcs_test)(void); + +static struct { + char *name; + gcs_test test; + bool needs_enable; +} tests[] = { + { "read_status", read_status }, + { "base_enable", base_enable, true }, + { "read_gcspr_el0", read_gcspr_el0 }, + { "enable_writeable", enable_writeable, true }, + { "enable_push_pop", enable_push_pop, true }, + { "enable_all", enable_all, true }, + { "enable_invalid", enable_invalid, true }, + { "map_guarded_stack", map_guarded_stack }, + { "fork", test_fork }, +}; + +int main(void) +{ + int i, ret; + unsigned long gcs_mode; + + ksft_print_header(); + + /* + * We don't have getauxval() with nolibc so treat a failure to + * read GCS state as a lack of support and skip. 
+ */ + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, + &gcs_mode, 0, 0, 0); + if (ret != 0) + ksft_exit_skip("Failed to read GCS state: %d\n", ret); + + if (!(gcs_mode & PR_SHADOW_STACK_ENABLE)) { + gcs_mode = PR_SHADOW_STACK_ENABLE; + ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, + gcs_mode, 0, 0, 0); + if (ret != 0) + ksft_exit_fail_msg("Failed to enable GCS: %d\n", ret); + } + + ksft_set_plan(ARRAY_SIZE(tests)); + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + ksft_test_result((*tests[i].test)(), "%s\n", tests[i].name); + } + + /* One last test: disable GCS, we can do this one time */ + my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0); + if (ret != 0) + ksft_print_msg("Failed to disable GCS: %d\n", ret); + + ksft_finished(); + + return 0; +} diff --git a/tools/testing/selftests/arm64/gcs/gcs-util.h b/tools/testing/selftests/arm64/gcs/gcs-util.h new file mode 100644 index 000000000000..1ae6864d3f86 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/gcs-util.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023 ARM Limited. + */ + +#ifndef GCS_UTIL_H +#define GCS_UTIL_H + +#include + +#ifndef __NR_map_shadow_stack +#define __NR_map_shadow_stack 453 +#endif + +#ifndef __NR_prctl +#define __NR_prctl 167 +#endif + +/* Shadow Stack/Guarded Control Stack interface */ +#define PR_GET_SHADOW_STACK_STATUS 74 +#define PR_SET_SHADOW_STACK_STATUS 75 +#define PR_LOCK_SHADOW_STACK_STATUS 76 + +# define PR_SHADOW_STACK_ENABLE (1UL << 0) +# define PR_SHADOW_STACK_WRITE (1UL << 1) +# define PR_SHADOW_STACK_PUSH (1UL << 2) + +#define PR_SHADOW_STACK_ALL_MODES \ + PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE | PR_SHADOW_STACK_PUSH + +#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in the shadow stack */ +#define SHADOW_STACK_SET_MARKER (1ULL << 1) /* Set up a top of stack merker in the shadow stack */ + +#define GCS_CAP_ADDR_MASK (0xfffffffffffff000UL) +#define GCS_CAP_TOKEN_MASK (0x0000000000000fffUL) +#define GCS_CAP_VALID_TOKEN 1 +#define GCS_CAP_IN_PROGRESS_TOKEN 5 + +#define GCS_CAP(x) (((unsigned long)(x) & GCS_CAP_ADDR_MASK) | \ + GCS_CAP_VALID_TOKEN) + +static inline unsigned long *get_gcspr(void) +{ + unsigned long *gcspr; + + asm volatile( + "mrs %0, S3_3_C2_C5_1" + : "=r" (gcspr) + : + : "cc"); + + return gcspr; +} + +static inline void __attribute__((always_inline)) gcsss1(unsigned long *Xt) +{ + asm volatile ( + "sys #3, C7, C7, #2, %0\n" + : + : "rZ" (Xt) + : "memory"); +} + +static inline unsigned long __attribute__((always_inline)) *gcsss2(void) +{ + unsigned long *Xt; + + asm volatile( + "SYSL %0, #3, C7, C7, #3\n" + : "=r" (Xt) + : + : "memory"); + + return Xt; +} + +static inline bool chkfeat_gcs(void) +{ + register long val __asm__ ("x16") = 1; + + /* CHKFEAT x16 */ + asm volatile( + "hint #0x28\n" + : "=r" (val) + : "r" (val)); + + return val != 1; +} + +#endif From a505a52b4e292f5e031a01eb3d4e203eb18acb7d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:14 +0100 Subject: [PATCH 034/168] kselftest/arm64: Add a GCS test program built with the system libc There are things like threads which nolibc struggles with which we want to add coverage for, and the ABI allows us to test most of these even if libc itself does not understand GCS so add a test application built using the system libc. 
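To illustrate the ABI property most of the libc-based tests rely on, here is a minimal sketch (illustrative only, not part of the test code that follows; check_thread and main are made-up names, and the PR_* values mirror gcs-util.h below): a thread created through the system libc inherits the process GCS state and can confirm it with the arch-agnostic prctl(), passing all five arguments since some libcs do not zero unused ones.

  /* Illustrative sketch, not the selftest code itself */
  #define _GNU_SOURCE
  #include <pthread.h>
  #include <stdio.h>
  #include <sys/prctl.h>

  #ifndef PR_GET_SHADOW_STACK_STATUS
  #define PR_GET_SHADOW_STACK_STATUS	74
  #define PR_SHADOW_STACK_ENABLE		(1UL << 0)
  #endif

  /* Runs in a fresh thread: GCS enablement should have been inherited */
  static void *check_thread(void *arg)
  {
          unsigned long mode = 0;

          if (prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0) == 0 &&
              (mode & PR_SHADOW_STACK_ENABLE))
                  printf("GCS enabled in new thread\n");

          return NULL;
  }

  int main(void)
  {
          pthread_t t;

          if (pthread_create(&t, NULL, check_thread, NULL) == 0)
                  pthread_join(t, NULL);

          return 0;
  }

Note that the tests below enable GCS through a raw syscall wrapper rather than the libc prctl() wrapper, since a libc that is unaware of GCS cannot safely return from the enabling call.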
Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-35-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/gcs/.gitignore | 1 + tools/testing/selftests/arm64/gcs/Makefile | 4 +- tools/testing/selftests/arm64/gcs/gcs-util.h | 10 + tools/testing/selftests/arm64/gcs/libc-gcs.c | 728 +++++++++++++++++++ 4 files changed, 742 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/arm64/gcs/libc-gcs.c diff --git a/tools/testing/selftests/arm64/gcs/.gitignore b/tools/testing/selftests/arm64/gcs/.gitignore index 0e5e695ecba5..5810c4a163d4 100644 --- a/tools/testing/selftests/arm64/gcs/.gitignore +++ b/tools/testing/selftests/arm64/gcs/.gitignore @@ -1 +1,2 @@ basic-gcs +libc-gcs diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile index 61a30f483429..a8fdf21e9a47 100644 --- a/tools/testing/selftests/arm64/gcs/Makefile +++ b/tools/testing/selftests/arm64/gcs/Makefile @@ -6,7 +6,9 @@ # nolibc. # -TEST_GEN_PROGS := basic-gcs +TEST_GEN_PROGS := basic-gcs libc-gcs + +LDLIBS+=-lpthread include ../../lib.mk diff --git a/tools/testing/selftests/arm64/gcs/gcs-util.h b/tools/testing/selftests/arm64/gcs/gcs-util.h index 1ae6864d3f86..c99a6b39ac14 100644 --- a/tools/testing/selftests/arm64/gcs/gcs-util.h +++ b/tools/testing/selftests/arm64/gcs/gcs-util.h @@ -16,6 +16,16 @@ #define __NR_prctl 167 #endif +#ifndef NT_ARM_GCS +#define NT_ARM_GCS 0x410 + +struct user_gcs { + __u64 features_enabled; + __u64 features_locked; + __u64 gcspr_el0; +}; +#endif + /* Shadow Stack/Guarded Control Stack interface */ #define PR_GET_SHADOW_STACK_STATUS 74 #define PR_SET_SHADOW_STACK_STATUS 75 diff --git a/tools/testing/selftests/arm64/gcs/libc-gcs.c b/tools/testing/selftests/arm64/gcs/libc-gcs.c new file mode 100644 index 000000000000..17b2fabfec38 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/libc-gcs.c @@ -0,0 +1,728 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Limited. + */ + +#define _GNU_SOURCE + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "kselftest_harness.h" + +#include "gcs-util.h" + +#define my_syscall2(num, arg1, arg2) \ +({ \ + register long _num __asm__ ("x8") = (num); \ + register long _arg1 __asm__ ("x0") = (long)(arg1); \ + register long _arg2 __asm__ ("x1") = (long)(arg2); \ + register long _arg3 __asm__ ("x2") = 0; \ + register long _arg4 __asm__ ("x3") = 0; \ + register long _arg5 __asm__ ("x4") = 0; \ + \ + __asm__ volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), \ + "r"(_arg3), "r"(_arg4), \ + "r"(_arg5), "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +static noinline void gcs_recurse(int depth) +{ + if (depth) + gcs_recurse(depth - 1); + + /* Prevent tail call optimization so we actually recurse */ + asm volatile("dsb sy" : : : "memory"); +} + +/* Smoke test that a function call and return works*/ +TEST(can_call_function) +{ + gcs_recurse(0); +} + +static void *gcs_test_thread(void *arg) +{ + int ret; + unsigned long mode; + + /* + * Some libcs don't seem to fill unused arguments with 0 but + * the kernel validates this so we supply all 5 arguments. 
+ */ + ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0); + if (ret != 0) { + ksft_print_msg("PR_GET_SHADOW_STACK_STATUS failed: %d\n", ret); + return NULL; + } + + if (!(mode & PR_SHADOW_STACK_ENABLE)) { + ksft_print_msg("GCS not enabled in thread, mode is %lu\n", + mode); + return NULL; + } + + /* Just in case... */ + gcs_recurse(0); + + /* Use a non-NULL value to indicate a pass */ + return &gcs_test_thread; +} + +/* Verify that if we start a new thread it has GCS enabled */ +TEST(gcs_enabled_thread) +{ + pthread_t thread; + void *thread_ret; + int ret; + + ret = pthread_create(&thread, NULL, gcs_test_thread, NULL); + ASSERT_TRUE(ret == 0); + if (ret != 0) + return; + + ret = pthread_join(thread, &thread_ret); + ASSERT_TRUE(ret == 0); + if (ret != 0) + return; + + ASSERT_TRUE(thread_ret != NULL); +} + +/* Read the GCS until we find the terminator */ +TEST(gcs_find_terminator) +{ + unsigned long *gcs, *cur; + + gcs = get_gcspr(); + cur = gcs; + while (*cur) + cur++; + + ksft_print_msg("GCS in use from %p-%p\n", gcs, cur); + + /* + * We should have at least whatever called into this test so + * the two pointer should differ. + */ + ASSERT_TRUE(gcs != cur); +} + +/* + * We can access a GCS via ptrace + * + * This could usefully have a fixture but note that each test is + * fork()ed into a new child whcih causes issues. Might be better to + * lift at least some of this out into a separate, non-harness, test + * program. + */ +TEST(ptrace_read_write) +{ + pid_t child, pid; + int ret, status; + siginfo_t si; + uint64_t val, rval, gcspr; + struct user_gcs child_gcs; + struct iovec iov, local_iov, remote_iov; + + child = fork(); + if (child == -1) { + ksft_print_msg("fork() failed: %d (%s)\n", + errno, strerror(errno)); + ASSERT_NE(child, -1); + } + + if (child == 0) { + /* + * In child, make sure there's something on the stack and + * ask to be traced. + */ + gcs_recurse(0); + if (ptrace(PTRACE_TRACEME, -1, NULL, NULL)) + ksft_exit_fail_msg("PTRACE_TRACEME %s", + strerror(errno)); + + if (raise(SIGSTOP)) + ksft_exit_fail_msg("raise(SIGSTOP) %s", + strerror(errno)); + + return; + } + + ksft_print_msg("Child: %d\n", child); + + /* Attach to the child */ + while (1) { + int sig; + + pid = wait(&status); + if (pid == -1) { + ksft_print_msg("wait() failed: %s", + strerror(errno)); + goto error; + } + + /* + * This should never happen but it's hard to flag in + * the framework. + */ + if (pid != child) + continue; + + if (WIFEXITED(status) || WIFSIGNALED(status)) + ksft_exit_fail_msg("Child died unexpectedly\n"); + + if (!WIFSTOPPED(status)) + goto error; + + sig = WSTOPSIG(status); + + if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &si)) { + if (errno == ESRCH) { + ASSERT_NE(errno, ESRCH); + return; + } + + if (errno == EINVAL) { + sig = 0; /* bust group-stop */ + goto cont; + } + + ksft_print_msg("PTRACE_GETSIGINFO: %s\n", + strerror(errno)); + goto error; + } + + if (sig == SIGSTOP && si.si_code == SI_TKILL && + si.si_pid == pid) + break; + + cont: + if (ptrace(PTRACE_CONT, pid, NULL, sig)) { + if (errno == ESRCH) { + ASSERT_NE(errno, ESRCH); + return; + } + + ksft_print_msg("PTRACE_CONT: %s\n", strerror(errno)); + goto error; + } + } + + /* Where is the child GCS? 
*/ + iov.iov_base = &child_gcs; + iov.iov_len = sizeof(child_gcs); + ret = ptrace(PTRACE_GETREGSET, child, NT_ARM_GCS, &iov); + if (ret != 0) { + ksft_print_msg("Failed to read child GCS state: %s (%d)\n", + strerror(errno), errno); + goto error; + } + + /* We should have inherited GCS over fork(), confirm */ + if (!(child_gcs.features_enabled & PR_SHADOW_STACK_ENABLE)) { + ASSERT_TRUE(child_gcs.features_enabled & + PR_SHADOW_STACK_ENABLE); + goto error; + } + + gcspr = child_gcs.gcspr_el0; + ksft_print_msg("Child GCSPR 0x%lx, flags %llx, locked %llx\n", + gcspr, child_gcs.features_enabled, + child_gcs.features_locked); + + /* Ideally we'd cross check with the child memory map */ + + errno = 0; + val = ptrace(PTRACE_PEEKDATA, child, (void *)gcspr, NULL); + ret = errno; + if (ret != 0) + ksft_print_msg("PTRACE_PEEKDATA failed: %s (%d)\n", + strerror(ret), ret); + EXPECT_EQ(ret, 0); + + /* The child should be in a function, the GCSPR shouldn't be 0 */ + EXPECT_NE(val, 0); + + /* Same thing via process_vm_readv() */ + local_iov.iov_base = &rval; + local_iov.iov_len = sizeof(rval); + remote_iov.iov_base = (void *)gcspr; + remote_iov.iov_len = sizeof(rval); + ret = process_vm_readv(child, &local_iov, 1, &remote_iov, 1, 0); + if (ret == -1) + ksft_print_msg("process_vm_readv() failed: %s (%d)\n", + strerror(errno), errno); + EXPECT_EQ(ret, sizeof(rval)); + EXPECT_EQ(val, rval); + + /* Write data via a peek */ + ret = ptrace(PTRACE_POKEDATA, child, (void *)gcspr, NULL); + if (ret == -1) + ksft_print_msg("PTRACE_POKEDATA failed: %s (%d)\n", + strerror(errno), errno); + EXPECT_EQ(ret, 0); + EXPECT_EQ(0, ptrace(PTRACE_PEEKDATA, child, (void *)gcspr, NULL)); + + /* Restore what we had before */ + ret = ptrace(PTRACE_POKEDATA, child, (void *)gcspr, val); + if (ret == -1) + ksft_print_msg("PTRACE_POKEDATA failed: %s (%d)\n", + strerror(errno), errno); + EXPECT_EQ(ret, 0); + EXPECT_EQ(val, ptrace(PTRACE_PEEKDATA, child, (void *)gcspr, NULL)); + + /* That's all, folks */ + kill(child, SIGKILL); + return; + +error: + kill(child, SIGKILL); + ASSERT_FALSE(true); +} + +FIXTURE(map_gcs) +{ + unsigned long *stack; +}; + +FIXTURE_VARIANT(map_gcs) +{ + size_t stack_size; + unsigned long flags; +}; + +FIXTURE_VARIANT_ADD(map_gcs, s2k_cap_marker) +{ + .stack_size = 2 * 1024, + .flags = SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s2k_cap) +{ + .stack_size = 2 * 1024, + .flags = SHADOW_STACK_SET_TOKEN, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s2k_marker) +{ + .stack_size = 2 * 1024, + .flags = SHADOW_STACK_SET_MARKER, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s2k) +{ + .stack_size = 2 * 1024, + .flags = 0, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s4k_cap_marker) +{ + .stack_size = 4 * 1024, + .flags = SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s4k_cap) +{ + .stack_size = 4 * 1024, + .flags = SHADOW_STACK_SET_TOKEN, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s3k_marker) +{ + .stack_size = 4 * 1024, + .flags = SHADOW_STACK_SET_MARKER, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s4k) +{ + .stack_size = 4 * 1024, + .flags = 0, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s16k_cap_marker) +{ + .stack_size = 16 * 1024, + .flags = SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s16k_cap) +{ + .stack_size = 16 * 1024, + .flags = SHADOW_STACK_SET_TOKEN, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s16k_marker) +{ + .stack_size = 16 * 1024, + .flags = SHADOW_STACK_SET_MARKER, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s16k) +{ + .stack_size = 16 * 1024, + 
.flags = 0, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s64k_cap_marker) +{ + .stack_size = 64 * 1024, + .flags = SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s64k_cap) +{ + .stack_size = 64 * 1024, + .flags = SHADOW_STACK_SET_TOKEN, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s64k_marker) +{ + .stack_size = 64 * 1024, + .flags = SHADOW_STACK_SET_MARKER, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s64k) +{ + .stack_size = 64 * 1024, + .flags = 0, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s128k_cap_marker) +{ + .stack_size = 128 * 1024, + .flags = SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s128k_cap) +{ + .stack_size = 128 * 1024, + .flags = SHADOW_STACK_SET_TOKEN, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s128k_marker) +{ + .stack_size = 128 * 1024, + .flags = SHADOW_STACK_SET_MARKER, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s128k) +{ + .stack_size = 128 * 1024, + .flags = 0, +}; + +FIXTURE_SETUP(map_gcs) +{ + self->stack = (void *)syscall(__NR_map_shadow_stack, 0, + variant->stack_size, + variant->flags); + ASSERT_FALSE(self->stack == MAP_FAILED); + ksft_print_msg("Allocated stack from %p-%p\n", self->stack, + self->stack + variant->stack_size); +} + +FIXTURE_TEARDOWN(map_gcs) +{ + int ret; + + if (self->stack != MAP_FAILED) { + ret = munmap(self->stack, variant->stack_size); + ASSERT_EQ(ret, 0); + } +} + +/* The stack has a cap token */ +TEST_F(map_gcs, stack_capped) +{ + unsigned long *stack = self->stack; + size_t cap_index; + + cap_index = (variant->stack_size / sizeof(unsigned long)); + + switch (variant->flags & (SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN)) { + case SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN: + cap_index -= 2; + break; + case SHADOW_STACK_SET_TOKEN: + cap_index -= 1; + break; + case SHADOW_STACK_SET_MARKER: + case 0: + /* No cap, no test */ + return; + } + + ASSERT_EQ(stack[cap_index], GCS_CAP(&stack[cap_index])); +} + +/* The top of the stack is 0 */ +TEST_F(map_gcs, stack_terminated) +{ + unsigned long *stack = self->stack; + size_t term_index; + + if (!(variant->flags & SHADOW_STACK_SET_MARKER)) + return; + + term_index = (variant->stack_size / sizeof(unsigned long)) - 1; + + ASSERT_EQ(stack[term_index], 0); +} + +/* Writes should fault */ +TEST_F_SIGNAL(map_gcs, not_writeable, SIGSEGV) +{ + self->stack[0] = 0; +} + +/* Put it all together, we can safely switch to and from the stack */ +TEST_F(map_gcs, stack_switch) +{ + size_t cap_index; + cap_index = (variant->stack_size / sizeof(unsigned long)); + unsigned long *orig_gcspr_el0, *pivot_gcspr_el0; + + /* Skip over the stack terminator and point at the cap */ + switch (variant->flags & (SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN)) { + case SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN: + cap_index -= 2; + break; + case SHADOW_STACK_SET_TOKEN: + cap_index -= 1; + break; + case SHADOW_STACK_SET_MARKER: + case 0: + /* No cap, no test */ + return; + } + pivot_gcspr_el0 = &self->stack[cap_index]; + + /* Pivot to the new GCS */ + ksft_print_msg("Pivoting to %p from %p, target has value 0x%lx\n", + pivot_gcspr_el0, get_gcspr(), + *pivot_gcspr_el0); + gcsss1(pivot_gcspr_el0); + orig_gcspr_el0 = gcsss2(); + ksft_print_msg("Pivoted to %p from %p, target has value 0x%lx\n", + get_gcspr(), orig_gcspr_el0, + *pivot_gcspr_el0); + + ksft_print_msg("Pivoted, GCSPR_EL0 now %p\n", get_gcspr()); + + /* New GCS must be in the new buffer */ + ASSERT_TRUE((unsigned long)get_gcspr() > (unsigned long)self->stack); + ASSERT_TRUE((unsigned long)get_gcspr() <= + (unsigned 
long)self->stack + variant->stack_size); + + /* We should be able to use all but 2 slots of the new stack */ + ksft_print_msg("Recursing %zu levels\n", cap_index - 1); + gcs_recurse(cap_index - 1); + + /* Pivot back to the original GCS */ + gcsss1(orig_gcspr_el0); + pivot_gcspr_el0 = gcsss2(); + + gcs_recurse(0); + ksft_print_msg("Pivoted back to GCSPR_EL0 0x%p\n", get_gcspr()); +} + +/* We fault if we try to go beyond the end of the stack */ +TEST_F_SIGNAL(map_gcs, stack_overflow, SIGSEGV) +{ + size_t cap_index; + cap_index = (variant->stack_size / sizeof(unsigned long)); + unsigned long *orig_gcspr_el0, *pivot_gcspr_el0; + + /* Skip over the stack terminator and point at the cap */ + switch (variant->flags & (SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN)) { + case SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN: + cap_index -= 2; + break; + case SHADOW_STACK_SET_TOKEN: + cap_index -= 1; + break; + case SHADOW_STACK_SET_MARKER: + case 0: + /* No cap, no test but we need to SEGV to avoid a false fail */ + orig_gcspr_el0 = get_gcspr(); + *orig_gcspr_el0 = 0; + return; + } + pivot_gcspr_el0 = &self->stack[cap_index]; + + /* Pivot to the new GCS */ + ksft_print_msg("Pivoting to %p from %p, target has value 0x%lx\n", + pivot_gcspr_el0, get_gcspr(), + *pivot_gcspr_el0); + gcsss1(pivot_gcspr_el0); + orig_gcspr_el0 = gcsss2(); + ksft_print_msg("Pivoted to %p from %p, target has value 0x%lx\n", + pivot_gcspr_el0, orig_gcspr_el0, + *pivot_gcspr_el0); + + ksft_print_msg("Pivoted, GCSPR_EL0 now %p\n", get_gcspr()); + + /* New GCS must be in the new buffer */ + ASSERT_TRUE((unsigned long)get_gcspr() > (unsigned long)self->stack); + ASSERT_TRUE((unsigned long)get_gcspr() <= + (unsigned long)self->stack + variant->stack_size); + + /* Now try to recurse, we should fault doing this. */ + ksft_print_msg("Recursing %zu levels...\n", cap_index + 1); + gcs_recurse(cap_index + 1); + ksft_print_msg("...done\n"); + + /* Clean up properly to try to guard against spurious passes. 
*/ + gcsss1(orig_gcspr_el0); + pivot_gcspr_el0 = gcsss2(); + ksft_print_msg("Pivoted back to GCSPR_EL0 0x%p\n", get_gcspr()); +} + +FIXTURE(map_invalid_gcs) +{ +}; + +FIXTURE_VARIANT(map_invalid_gcs) +{ + size_t stack_size; +}; + +FIXTURE_SETUP(map_invalid_gcs) +{ +} + +FIXTURE_TEARDOWN(map_invalid_gcs) +{ +} + +/* GCS must be larger than 16 bytes */ +FIXTURE_VARIANT_ADD(map_invalid_gcs, too_small) +{ + .stack_size = 8, +}; + +/* GCS size must be 16 byte aligned */ +FIXTURE_VARIANT_ADD(map_invalid_gcs, unligned_1) { .stack_size = 1024 + 1 }; +FIXTURE_VARIANT_ADD(map_invalid_gcs, unligned_2) { .stack_size = 1024 + 2 }; +FIXTURE_VARIANT_ADD(map_invalid_gcs, unligned_3) { .stack_size = 1024 + 3 }; +FIXTURE_VARIANT_ADD(map_invalid_gcs, unligned_4) { .stack_size = 1024 + 4 }; +FIXTURE_VARIANT_ADD(map_invalid_gcs, unligned_5) { .stack_size = 1024 + 5 }; +FIXTURE_VARIANT_ADD(map_invalid_gcs, unligned_6) { .stack_size = 1024 + 6 }; +FIXTURE_VARIANT_ADD(map_invalid_gcs, unligned_7) { .stack_size = 1024 + 7 }; + +TEST_F(map_invalid_gcs, do_map) +{ + void *stack; + + stack = (void *)syscall(__NR_map_shadow_stack, 0, + variant->stack_size, 0); + ASSERT_TRUE(stack == MAP_FAILED); + if (stack != MAP_FAILED) + munmap(stack, variant->stack_size); +} + +FIXTURE(invalid_mprotect) +{ + unsigned long *stack; + size_t stack_size; +}; + +FIXTURE_VARIANT(invalid_mprotect) +{ + unsigned long flags; +}; + +FIXTURE_SETUP(invalid_mprotect) +{ + self->stack_size = sysconf(_SC_PAGE_SIZE); + self->stack = (void *)syscall(__NR_map_shadow_stack, 0, + self->stack_size, 0); + ASSERT_FALSE(self->stack == MAP_FAILED); + ksft_print_msg("Allocated stack from %p-%p\n", self->stack, + self->stack + self->stack_size); +} + +FIXTURE_TEARDOWN(invalid_mprotect) +{ + int ret; + + if (self->stack != MAP_FAILED) { + ret = munmap(self->stack, self->stack_size); + ASSERT_EQ(ret, 0); + } +} + +FIXTURE_VARIANT_ADD(invalid_mprotect, exec) +{ + .flags = PROT_EXEC, +}; + +TEST_F(invalid_mprotect, do_map) +{ + int ret; + + ret = mprotect(self->stack, self->stack_size, variant->flags); + ASSERT_EQ(ret, -1); +} + +TEST_F(invalid_mprotect, do_map_read) +{ + int ret; + + ret = mprotect(self->stack, self->stack_size, + variant->flags | PROT_READ); + ASSERT_EQ(ret, -1); +} + +int main(int argc, char **argv) +{ + unsigned long gcs_mode; + int ret; + + if (!(getauxval(AT_HWCAP) & HWCAP_GCS)) + ksft_exit_skip("SKIP GCS not supported\n"); + + /* + * Force shadow stacks on, our tests *should* be fine with or + * without libc support and with or without this having ended + * up tagged for GCS and enabled by the dynamic linker. We + * can't use the libc prctl() function since we can't return + * from enabling the stack. 
+ */ + ret = my_syscall2(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &gcs_mode); + if (ret) { + ksft_print_msg("Failed to read GCS state: %d\n", ret); + return EXIT_FAILURE; + } + + if (!(gcs_mode & PR_SHADOW_STACK_ENABLE)) { + gcs_mode = PR_SHADOW_STACK_ENABLE; + ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, + gcs_mode); + if (ret) { + ksft_print_msg("Failed to configure GCS: %d\n", ret); + return EXIT_FAILURE; + } + } + + /* Avoid returning in case libc doesn't understand GCS */ + exit(test_harness_run(argc, argv)); +} From 58d69a3e35825698b5daddc1a074e9ea19cb0c51 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:15 +0100 Subject: [PATCH 035/168] kselftest/arm64: Add test coverage for GCS mode locking Verify that we can lock individual GCS mode bits, that other modes aren't affected and as a side effect also that every combination of modes can be enabled. Normally the inability to reenable GCS after disabling it would be an issue with testing but fortunately the kselftest_harness runs each test within a fork()ed child. This can be inconvenient for some kinds of testing but here it means that each test is in a separate thread and therefore won't be affected by other tests in the suite. Once we get toolchains with support for enabling GCS by default we will need to take care to not do that in the build system but there are no such toolchains yet so it is not yet an issue. Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-36-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/gcs/.gitignore | 1 + tools/testing/selftests/arm64/gcs/Makefile | 2 +- .../testing/selftests/arm64/gcs/gcs-locking.c | 200 ++++++++++++++++++ 3 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/arm64/gcs/gcs-locking.c diff --git a/tools/testing/selftests/arm64/gcs/.gitignore b/tools/testing/selftests/arm64/gcs/.gitignore index 5810c4a163d4..0c86f53f68ad 100644 --- a/tools/testing/selftests/arm64/gcs/.gitignore +++ b/tools/testing/selftests/arm64/gcs/.gitignore @@ -1,2 +1,3 @@ basic-gcs libc-gcs +gcs-locking diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile index a8fdf21e9a47..2173d6275956 100644 --- a/tools/testing/selftests/arm64/gcs/Makefile +++ b/tools/testing/selftests/arm64/gcs/Makefile @@ -6,7 +6,7 @@ # nolibc. # -TEST_GEN_PROGS := basic-gcs libc-gcs +TEST_GEN_PROGS := basic-gcs libc-gcs gcs-locking LDLIBS+=-lpthread diff --git a/tools/testing/selftests/arm64/gcs/gcs-locking.c b/tools/testing/selftests/arm64/gcs/gcs-locking.c new file mode 100644 index 000000000000..989f75a491b7 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/gcs-locking.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Limited. + * + * Tests for GCS mode locking. These tests rely on both having GCS + * unconfigured on entry and on the kselftest harness running each + * test in a fork()ed process which will have it's own mode. 
+ */ + +#include + +#include +#include + +#include + +#include "kselftest_harness.h" + +#include "gcs-util.h" + +#define my_syscall2(num, arg1, arg2) \ +({ \ + register long _num __asm__ ("x8") = (num); \ + register long _arg1 __asm__ ("x0") = (long)(arg1); \ + register long _arg2 __asm__ ("x1") = (long)(arg2); \ + register long _arg3 __asm__ ("x2") = 0; \ + register long _arg4 __asm__ ("x3") = 0; \ + register long _arg5 __asm__ ("x4") = 0; \ + \ + __asm__ volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), \ + "r"(_arg3), "r"(_arg4), \ + "r"(_arg5), "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +/* No mode bits are rejected for locking */ +TEST(lock_all_modes) +{ + int ret; + + ret = prctl(PR_LOCK_SHADOW_STACK_STATUS, ULONG_MAX, 0, 0, 0); + ASSERT_EQ(ret, 0); +} + +FIXTURE(valid_modes) +{ +}; + +FIXTURE_VARIANT(valid_modes) +{ + unsigned long mode; +}; + +FIXTURE_VARIANT_ADD(valid_modes, enable) +{ + .mode = PR_SHADOW_STACK_ENABLE, +}; + +FIXTURE_VARIANT_ADD(valid_modes, enable_write) +{ + .mode = PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE, +}; + +FIXTURE_VARIANT_ADD(valid_modes, enable_push) +{ + .mode = PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_PUSH, +}; + +FIXTURE_VARIANT_ADD(valid_modes, enable_write_push) +{ + .mode = PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE | + PR_SHADOW_STACK_PUSH, +}; + +FIXTURE_SETUP(valid_modes) +{ +} + +FIXTURE_TEARDOWN(valid_modes) +{ +} + +/* We can set the mode at all */ +TEST_F(valid_modes, set) +{ + int ret; + + ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, + variant->mode); + ASSERT_EQ(ret, 0); + + _exit(0); +} + +/* Enabling, locking then disabling is rejected */ +TEST_F(valid_modes, enable_lock_disable) +{ + unsigned long mode; + int ret; + + ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, + variant->mode); + ASSERT_EQ(ret, 0); + + ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0); + ASSERT_EQ(ret, 0); + ASSERT_EQ(mode, variant->mode); + + ret = prctl(PR_LOCK_SHADOW_STACK_STATUS, variant->mode, 0, 0, 0); + ASSERT_EQ(ret, 0); + + ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0); + ASSERT_EQ(ret, -EBUSY); + + _exit(0); +} + +/* Locking then enabling is rejected */ +TEST_F(valid_modes, lock_enable) +{ + unsigned long mode; + int ret; + + ret = prctl(PR_LOCK_SHADOW_STACK_STATUS, variant->mode, 0, 0, 0); + ASSERT_EQ(ret, 0); + + ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, + variant->mode); + ASSERT_EQ(ret, -EBUSY); + + ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0); + ASSERT_EQ(ret, 0); + ASSERT_EQ(mode, 0); + + _exit(0); +} + +/* Locking then changing other modes is fine */ +TEST_F(valid_modes, lock_enable_disable_others) +{ + unsigned long mode; + int ret; + + ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, + variant->mode); + ASSERT_EQ(ret, 0); + + ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0); + ASSERT_EQ(ret, 0); + ASSERT_EQ(mode, variant->mode); + + ret = prctl(PR_LOCK_SHADOW_STACK_STATUS, variant->mode, 0, 0, 0); + ASSERT_EQ(ret, 0); + + ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, + PR_SHADOW_STACK_ALL_MODES); + ASSERT_EQ(ret, 0); + + ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0); + ASSERT_EQ(ret, 0); + ASSERT_EQ(mode, PR_SHADOW_STACK_ALL_MODES); + + + ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, + variant->mode); + ASSERT_EQ(ret, 0); + + ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0); + ASSERT_EQ(ret, 0); + ASSERT_EQ(mode, variant->mode); + + _exit(0); +} + +int main(int argc, char 
**argv) +{ + unsigned long mode; + int ret; + + if (!(getauxval(AT_HWCAP) & HWCAP_GCS)) + ksft_exit_skip("SKIP GCS not supported\n"); + + ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0); + if (ret) { + ksft_print_msg("Failed to read GCS state: %d\n", ret); + return EXIT_FAILURE; + } + + if (mode & PR_SHADOW_STACK_ENABLE) { + ksft_print_msg("GCS was enabled, test unsupported\n"); + return KSFT_SKIP; + } + + return test_harness_run(argc, argv); +} From 794b64ca5665323f36e5fc92dfca02a3797b6523 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:16 +0100 Subject: [PATCH 036/168] kselftest/arm64: Add GCS signal tests Do some testing of the signal handling for GCS, checking that a GCS frame has the expected information in it and that the expected signals are delivered with invalid operations. Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-37-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- .../testing/selftests/arm64/signal/.gitignore | 1 + .../arm64/signal/test_signals_utils.h | 10 +++ .../signal/testcases/gcs_exception_fault.c | 62 +++++++++++++ .../arm64/signal/testcases/gcs_frame.c | 88 +++++++++++++++++++ .../arm64/signal/testcases/gcs_write_fault.c | 67 ++++++++++++++ 5 files changed, 228 insertions(+) create mode 100644 tools/testing/selftests/arm64/signal/testcases/gcs_exception_fault.c create mode 100644 tools/testing/selftests/arm64/signal/testcases/gcs_frame.c create mode 100644 tools/testing/selftests/arm64/signal/testcases/gcs_write_fault.c diff --git a/tools/testing/selftests/arm64/signal/.gitignore b/tools/testing/selftests/arm64/signal/.gitignore index b2f2bfd5c6aa..b257db665a35 100644 --- a/tools/testing/selftests/arm64/signal/.gitignore +++ b/tools/testing/selftests/arm64/signal/.gitignore @@ -3,6 +3,7 @@ mangle_* fake_sigreturn_* fpmr_* poe_* +gcs_* sme_* ssve_* sve_* diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.h b/tools/testing/selftests/arm64/signal/test_signals_utils.h index 1e80808ee105..36fc12b3cd60 100644 --- a/tools/testing/selftests/arm64/signal/test_signals_utils.h +++ b/tools/testing/selftests/arm64/signal/test_signals_utils.h @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -47,6 +48,15 @@ void test_result(struct tdescr *td); _arg1; \ }) +static inline __attribute__((always_inline)) uint64_t get_gcspr_el0(void) +{ + uint64_t val; + + asm volatile("mrs %0, S3_3_C2_C5_1" : "=r" (val)); + + return val; +} + static inline bool feats_ok(struct tdescr *td) { if (td->feats_incompatible & td->feats_supported) diff --git a/tools/testing/selftests/arm64/signal/testcases/gcs_exception_fault.c b/tools/testing/selftests/arm64/signal/testcases/gcs_exception_fault.c new file mode 100644 index 000000000000..6228448b2ae7 --- /dev/null +++ b/tools/testing/selftests/arm64/signal/testcases/gcs_exception_fault.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 ARM Limited + */ + +#include +#include +#include + +#include +#include + +#include "test_signals_utils.h" +#include "testcases.h" + +/* + * We should get this from asm/siginfo.h but the testsuite is being + * clever with redefining siginfo_t. 
+ */ +#ifndef SEGV_CPERR +#define SEGV_CPERR 10 +#endif + +static inline void gcsss1(uint64_t Xt) +{ + asm volatile ( + "sys #3, C7, C7, #2, %0\n" + : + : "rZ" (Xt) + : "memory"); +} + +static int gcs_op_fault_trigger(struct tdescr *td) +{ + /* + * The slot below our current GCS should be in a valid GCS but + * must not have a valid cap in it. + */ + gcsss1(get_gcspr_el0() - 8); + + return 0; +} + +static int gcs_op_fault_signal(struct tdescr *td, siginfo_t *si, + ucontext_t *uc) +{ + ASSERT_GOOD_CONTEXT(uc); + + return 1; +} + +struct tdescr tde = { + .name = "Invalid GCS operation", + .descr = "An invalid GCS operation generates the expected signal", + .feats_required = FEAT_GCS, + .timeout = 3, + .sig_ok = SIGSEGV, + .sig_ok_code = SEGV_CPERR, + .sanity_disabled = true, + .trigger = gcs_op_fault_trigger, + .run = gcs_op_fault_signal, +}; diff --git a/tools/testing/selftests/arm64/signal/testcases/gcs_frame.c b/tools/testing/selftests/arm64/signal/testcases/gcs_frame.c new file mode 100644 index 000000000000..b405d82321da --- /dev/null +++ b/tools/testing/selftests/arm64/signal/testcases/gcs_frame.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 ARM Limited + */ + +#include +#include +#include + +#include "test_signals_utils.h" +#include "testcases.h" + +static union { + ucontext_t uc; + char buf[1024 * 64]; +} context; + +static int gcs_regs(struct tdescr *td, siginfo_t *si, ucontext_t *uc) +{ + size_t offset; + struct _aarch64_ctx *head = GET_BUF_RESV_HEAD(context); + struct gcs_context *gcs; + unsigned long expected, gcspr; + uint64_t *u64_val; + int ret; + + ret = prctl(PR_GET_SHADOW_STACK_STATUS, &expected, 0, 0, 0); + if (ret != 0) { + fprintf(stderr, "Unable to query GCS status\n"); + return 1; + } + + /* We expect a cap to be added to the GCS in the signal frame */ + gcspr = get_gcspr_el0(); + gcspr -= 8; + fprintf(stderr, "Expecting GCSPR_EL0 %lx\n", gcspr); + + if (!get_current_context(td, &context.uc, sizeof(context))) { + fprintf(stderr, "Failed getting context\n"); + return 1; + } + + /* Ensure that the signal restore token was consumed */ + u64_val = (uint64_t *)get_gcspr_el0() + 1; + if (*u64_val) { + fprintf(stderr, "GCS value at %p is %lx not 0\n", + u64_val, *u64_val); + return 1; + } + + fprintf(stderr, "Got context\n"); + + head = get_header(head, GCS_MAGIC, GET_BUF_RESV_SIZE(context), + &offset); + if (!head) { + fprintf(stderr, "No GCS context\n"); + return 1; + } + + gcs = (struct gcs_context *)head; + + /* Basic size validation is done in get_current_context() */ + + if (gcs->features_enabled != expected) { + fprintf(stderr, "Features enabled %llx but expected %lx\n", + gcs->features_enabled, expected); + return 1; + } + + if (gcs->gcspr != gcspr) { + fprintf(stderr, "Got GCSPR %llx but expected %lx\n", + gcs->gcspr, gcspr); + return 1; + } + + fprintf(stderr, "GCS context validated\n"); + td->pass = 1; + + return 0; +} + +struct tdescr tde = { + .name = "GCS basics", + .descr = "Validate a GCS signal context", + .feats_required = FEAT_GCS, + .timeout = 3, + .run = gcs_regs, +}; diff --git a/tools/testing/selftests/arm64/signal/testcases/gcs_write_fault.c b/tools/testing/selftests/arm64/signal/testcases/gcs_write_fault.c new file mode 100644 index 000000000000..faeabb18c4b2 --- /dev/null +++ b/tools/testing/selftests/arm64/signal/testcases/gcs_write_fault.c @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 ARM Limited + */ + +#include +#include +#include + +#include +#include + +#include 
"test_signals_utils.h" +#include "testcases.h" + +static uint64_t *gcs_page; + +#ifndef __NR_map_shadow_stack +#define __NR_map_shadow_stack 453 +#endif + +static bool alloc_gcs(struct tdescr *td) +{ + long page_size = sysconf(_SC_PAGE_SIZE); + + gcs_page = (void *)syscall(__NR_map_shadow_stack, 0, + page_size, 0); + if (gcs_page == MAP_FAILED) { + fprintf(stderr, "Failed to map %ld byte GCS: %d\n", + page_size, errno); + return false; + } + + return true; +} + +static int gcs_write_fault_trigger(struct tdescr *td) +{ + /* Verify that the page is readable (ie, not completely unmapped) */ + fprintf(stderr, "Read value 0x%lx\n", gcs_page[0]); + + /* A regular write should trigger a fault */ + gcs_page[0] = EINVAL; + + return 0; +} + +static int gcs_write_fault_signal(struct tdescr *td, siginfo_t *si, + ucontext_t *uc) +{ + ASSERT_GOOD_CONTEXT(uc); + + return 1; +} + + +struct tdescr tde = { + .name = "GCS write fault", + .descr = "Normal writes to a GCS segfault", + .feats_required = FEAT_GCS, + .timeout = 3, + .sig_ok = SIGSEGV, + .sanity_disabled = true, + .init = alloc_gcs, + .trigger = gcs_write_fault_trigger, + .run = gcs_write_fault_signal, +}; From 05e6cfff58c481bfe0ada24ebe1c205e2817dacd Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:17 +0100 Subject: [PATCH 037/168] kselftest/arm64: Add a GCS stress test Add a stress test which runs one more process than we have CPUs spinning through a very recursive function with frequent syscalls immediately prior to return and signals being injected every 100ms. The goal is to flag up any scheduling related issues, for example failure to ensure that barriers are inserted when moving a GCS using task to another CPU. The test runs for a configurable amount of time, defaulting to 10 seconds. Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-38-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/gcs/.gitignore | 2 + tools/testing/selftests/arm64/gcs/Makefile | 6 +- .../testing/selftests/arm64/gcs/asm-offsets.h | 0 .../selftests/arm64/gcs/gcs-stress-thread.S | 311 ++++++++++ .../testing/selftests/arm64/gcs/gcs-stress.c | 530 ++++++++++++++++++ 5 files changed, 848 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/arm64/gcs/asm-offsets.h create mode 100644 tools/testing/selftests/arm64/gcs/gcs-stress-thread.S create mode 100644 tools/testing/selftests/arm64/gcs/gcs-stress.c diff --git a/tools/testing/selftests/arm64/gcs/.gitignore b/tools/testing/selftests/arm64/gcs/.gitignore index 0c86f53f68ad..1e8d1f6b27f2 100644 --- a/tools/testing/selftests/arm64/gcs/.gitignore +++ b/tools/testing/selftests/arm64/gcs/.gitignore @@ -1,3 +1,5 @@ basic-gcs libc-gcs gcs-locking +gcs-stress +gcs-stress-thread diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile index 2173d6275956..d8b06ca51e22 100644 --- a/tools/testing/selftests/arm64/gcs/Makefile +++ b/tools/testing/selftests/arm64/gcs/Makefile @@ -6,7 +6,8 @@ # nolibc. # -TEST_GEN_PROGS := basic-gcs libc-gcs gcs-locking +TEST_GEN_PROGS := basic-gcs libc-gcs gcs-locking gcs-stress +TEST_GEN_PROGS_EXTENDED := gcs-stress-thread LDLIBS+=-lpthread @@ -18,3 +19,6 @@ $(OUTPUT)/basic-gcs: basic-gcs.c -I../../../../../usr/include \ -std=gnu99 -I../.. 
-g \ -ffreestanding -Wall $^ -o $@ -lgcc + +$(OUTPUT)/gcs-stress-thread: gcs-stress-thread.S + $(CC) -nostdlib $^ -o $@ diff --git a/tools/testing/selftests/arm64/gcs/asm-offsets.h b/tools/testing/selftests/arm64/gcs/asm-offsets.h new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress-thread.S b/tools/testing/selftests/arm64/gcs/gcs-stress-thread.S new file mode 100644 index 000000000000..b88b25217da5 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/gcs-stress-thread.S @@ -0,0 +1,311 @@ +// Program that loops for ever doing lots of recursions and system calls, +// intended to be used as part of a stress test for GCS context switching. +// +// Copyright 2015-2023 Arm Ltd + +#include + +#define sa_sz 32 +#define sa_flags 8 +#define sa_handler 0 +#define sa_mask_sz 8 + +#define si_code 8 + +#define SIGINT 2 +#define SIGABRT 6 +#define SIGUSR1 10 +#define SIGSEGV 11 +#define SIGUSR2 12 +#define SIGTERM 15 +#define SEGV_CPERR 10 + +#define SA_NODEFER 1073741824 +#define SA_SIGINFO 4 +#define ucontext_regs 184 + +#define PR_SET_SHADOW_STACK_STATUS 75 +# define PR_SHADOW_STACK_ENABLE (1UL << 0) + +#define GCSPR_EL0 S3_3_C2_C5_1 + +.macro function name + .macro endfunction + .type \name, @function + .purgem endfunction + .endm +\name: +.endm + +// Print a single character x0 to stdout +// Clobbers x0-x2,x8 +function putc + str x0, [sp, #-16]! + + mov x0, #1 // STDOUT_FILENO + mov x1, sp + mov x2, #1 + mov x8, #__NR_write + svc #0 + + add sp, sp, #16 + ret +endfunction +.globl putc + +// Print a NUL-terminated string starting at address x0 to stdout +// Clobbers x0-x3,x8 +function puts + mov x1, x0 + + mov x2, #0 +0: ldrb w3, [x0], #1 + cbz w3, 1f + add x2, x2, #1 + b 0b + +1: mov w0, #1 // STDOUT_FILENO + mov x8, #__NR_write + svc #0 + + ret +endfunction +.globl puts + +// Utility macro to print a literal string +// Clobbers x0-x4,x8 +.macro puts string + .pushsection .rodata.str1.1, "aMS", @progbits, 1 +.L__puts_literal\@: .string "\string" + .popsection + + ldr x0, =.L__puts_literal\@ + bl puts +.endm + +// Print an unsigned decimal number x0 to stdout +// Clobbers x0-x4,x8 +function putdec + mov x1, sp + str x30, [sp, #-32]! // Result can't be > 20 digits + + mov x2, #0 + strb w2, [x1, #-1]! // Write the NUL terminator + + mov x2, #10 +0: udiv x3, x0, x2 // div-mod loop to generate the digits + msub x0, x3, x2, x0 + add w0, w0, #'0' + strb w0, [x1, #-1]! + mov x0, x3 + cbnz x3, 0b + + ldrb w0, [x1] + cbnz w0, 1f + mov w0, #'0' // Print "0" for 0, not "" + strb w0, [x1, #-1]! + +1: mov x0, x1 + bl puts + + ldr x30, [sp], #32 + ret +endfunction +.globl putdec + +// Print an unsigned decimal number x0 to stdout, followed by a newline +// Clobbers x0-x5,x8 +function putdecn + mov x5, x30 + + bl putdec + mov x0, #'\n' + bl putc + + ret x5 +endfunction +.globl putdecn + +// Fill x1 bytes starting at x0 with 0. +// Clobbers x1, x2. +function memclr + mov w2, #0 +endfunction +.globl memclr + // fall through to memfill + +// Trivial memory fill: fill x1 bytes starting at address x0 with byte w2 +// Clobbers x1 +function memfill + cmp x1, #0 + b.eq 1f + +0: strb w2, [x0], #1 + subs x1, x1, #1 + b.ne 0b + +1: ret +endfunction +.globl memfill + +// w0: signal number +// x1: sa_action +// w2: sa_flags +// Clobbers x0-x6,x8 +function setsignal + str x30, [sp, #-((sa_sz + 15) / 16 * 16 + 16)]! 
+ + mov w4, w0 + mov x5, x1 + mov w6, w2 + + add x0, sp, #16 + mov x1, #sa_sz + bl memclr + + mov w0, w4 + add x1, sp, #16 + str w6, [x1, #sa_flags] + str x5, [x1, #sa_handler] + mov x2, #0 + mov x3, #sa_mask_sz + mov x8, #__NR_rt_sigaction + svc #0 + + cbz w0, 1f + + puts "sigaction failure\n" + b abort + +1: ldr x30, [sp], #((sa_sz + 15) / 16 * 16 + 16) + ret +endfunction + + +function tickle_handler + // Perhaps collect GCSPR_EL0 here in future? + ret +endfunction + +function terminate_handler + mov w21, w0 + mov x20, x2 + + puts "Terminated by signal " + mov w0, w21 + bl putdec + puts ", no error\n" + + mov x0, #0 + mov x8, #__NR_exit + svc #0 +endfunction + +function segv_handler + // stash the siginfo_t * + mov x20, x1 + + // Disable GCS, we don't want additional faults logging things + mov x0, PR_SET_SHADOW_STACK_STATUS + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x8, #__NR_prctl + svc #0 + + puts "Got SIGSEGV code " + + ldr x21, [x20, #si_code] + mov x0, x21 + bl putdec + + // GCS faults should have si_code SEGV_CPERR + cmp x21, #SEGV_CPERR + bne 1f + + puts " (GCS violation)" +1: + mov x0, '\n' + bl putc + b abort +endfunction + +// Recurse x20 times +.macro recurse id +function recurse\id + stp x29, x30, [sp, #-16]! + mov x29, sp + + cmp x20, 0 + beq 1f + sub x20, x20, 1 + bl recurse\id + +1: + ldp x29, x30, [sp], #16 + + // Do a syscall immediately prior to returning to try to provoke + // scheduling and migration at a point where coherency issues + // might trigger. + mov x8, #__NR_getpid + svc #0 + + ret +endfunction +.endm + +// Generate and use two copies so we're changing the GCS contents +recurse 1 +recurse 2 + +.globl _start +function _start + // Run with GCS + mov x0, PR_SET_SHADOW_STACK_STATUS + mov x1, PR_SHADOW_STACK_ENABLE + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x8, #__NR_prctl + svc #0 + cbz x0, 1f + puts "Failed to enable GCS\n" + b abort +1: + + mov w0, #SIGTERM + adr x1, terminate_handler + mov w2, #SA_SIGINFO + bl setsignal + + mov w0, #SIGUSR1 + adr x1, tickle_handler + mov w2, #SA_SIGINFO + orr w2, w2, #SA_NODEFER + bl setsignal + + mov w0, #SIGSEGV + adr x1, segv_handler + mov w2, #SA_SIGINFO + orr w2, w2, #SA_NODEFER + bl setsignal + + puts "Running\n" + +loop: + // Small recursion depth so we're frequently flipping between + // the two recursors and changing what's on the stack + mov x20, #5 + bl recurse1 + mov x20, #5 + bl recurse2 + b loop +endfunction + +abort: + mov x0, #255 + mov x8, #__NR_exit + svc #0 diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress.c b/tools/testing/selftests/arm64/gcs/gcs-stress.c new file mode 100644 index 000000000000..bdec7ee8cfd5 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/gcs-stress.c @@ -0,0 +1,530 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022-3 ARM Limited. 
+ */ + +#define _GNU_SOURCE +#define _POSIX_C_SOURCE 199309L + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../kselftest.h" + +struct child_data { + char *name, *output; + pid_t pid; + int stdout; + bool output_seen; + bool exited; + int exit_status; + int exit_signal; +}; + +static int epoll_fd; +static struct child_data *children; +static struct epoll_event *evs; +static int tests; +static int num_children; +static bool terminate; + +static int startup_pipe[2]; + +static int num_processors(void) +{ + long nproc = sysconf(_SC_NPROCESSORS_CONF); + if (nproc < 0) { + perror("Unable to read number of processors\n"); + exit(EXIT_FAILURE); + } + + return nproc; +} + +static void start_thread(struct child_data *child) +{ + int ret, pipefd[2], i; + struct epoll_event ev; + + ret = pipe(pipefd); + if (ret != 0) + ksft_exit_fail_msg("Failed to create stdout pipe: %s (%d)\n", + strerror(errno), errno); + + child->pid = fork(); + if (child->pid == -1) + ksft_exit_fail_msg("fork() failed: %s (%d)\n", + strerror(errno), errno); + + if (!child->pid) { + /* + * In child, replace stdout with the pipe, errors to + * stderr from here as kselftest prints to stdout. + */ + ret = dup2(pipefd[1], 1); + if (ret == -1) { + fprintf(stderr, "dup2() %d\n", errno); + exit(EXIT_FAILURE); + } + + /* + * Duplicate the read side of the startup pipe to + * FD 3 so we can close everything else. + */ + ret = dup2(startup_pipe[0], 3); + if (ret == -1) { + fprintf(stderr, "dup2() %d\n", errno); + exit(EXIT_FAILURE); + } + + /* + * Very dumb mechanism to clean open FDs other than + * stdio. We don't want O_CLOEXEC for the pipes... + */ + for (i = 4; i < 8192; i++) + close(i); + + /* + * Read from the startup pipe, there should be no data + * and we should block until it is closed. We just + * carry on on error since this isn't super critical. + */ + ret = read(3, &i, sizeof(i)); + if (ret < 0) + fprintf(stderr, "read(startp pipe) failed: %s (%d)\n", + strerror(errno), errno); + if (ret > 0) + fprintf(stderr, "%d bytes of data on startup pipe\n", + ret); + close(3); + + ret = execl("gcs-stress-thread", "gcs-stress-thread", NULL); + fprintf(stderr, "execl(gcs-stress-thread) failed: %d (%s)\n", + errno, strerror(errno)); + + exit(EXIT_FAILURE); + } else { + /* + * In parent, remember the child and close our copy of the + * write side of stdout. 
+ */ + close(pipefd[1]); + child->stdout = pipefd[0]; + child->output = NULL; + child->exited = false; + child->output_seen = false; + + ev.events = EPOLLIN | EPOLLHUP; + ev.data.ptr = child; + + ret = asprintf(&child->name, "Thread-%d", child->pid); + if (ret == -1) + ksft_exit_fail_msg("asprintf() failed\n"); + + ret = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, child->stdout, &ev); + if (ret < 0) { + ksft_exit_fail_msg("%s EPOLL_CTL_ADD failed: %s (%d)\n", + child->name, strerror(errno), errno); + } + } + + ksft_print_msg("Started %s\n", child->name); + num_children++; +} + +static bool child_output_read(struct child_data *child) +{ + char read_data[1024]; + char work[1024]; + int ret, len, cur_work, cur_read; + + ret = read(child->stdout, read_data, sizeof(read_data)); + if (ret < 0) { + if (errno == EINTR) + return true; + + ksft_print_msg("%s: read() failed: %s (%d)\n", + child->name, strerror(errno), + errno); + return false; + } + len = ret; + + child->output_seen = true; + + /* Pick up any partial read */ + if (child->output) { + strncpy(work, child->output, sizeof(work) - 1); + cur_work = strnlen(work, sizeof(work)); + free(child->output); + child->output = NULL; + } else { + cur_work = 0; + } + + cur_read = 0; + while (cur_read < len) { + work[cur_work] = read_data[cur_read++]; + + if (work[cur_work] == '\n') { + work[cur_work] = '\0'; + ksft_print_msg("%s: %s\n", child->name, work); + cur_work = 0; + } else { + cur_work++; + } + } + + if (cur_work) { + work[cur_work] = '\0'; + ret = asprintf(&child->output, "%s", work); + if (ret == -1) + ksft_exit_fail_msg("Out of memory\n"); + } + + return false; +} + +static void child_output(struct child_data *child, uint32_t events, + bool flush) +{ + bool read_more; + + if (events & EPOLLIN) { + do { + read_more = child_output_read(child); + } while (read_more); + } + + if (events & EPOLLHUP) { + close(child->stdout); + child->stdout = -1; + flush = true; + } + + if (flush && child->output) { + ksft_print_msg("%s: %s\n", child->name, child->output); + free(child->output); + child->output = NULL; + } +} + +static void child_tickle(struct child_data *child) +{ + if (child->output_seen && !child->exited) + kill(child->pid, SIGUSR1); +} + +static void child_stop(struct child_data *child) +{ + if (!child->exited) + kill(child->pid, SIGTERM); +} + +static void child_cleanup(struct child_data *child) +{ + pid_t ret; + int status; + bool fail = false; + + if (!child->exited) { + do { + ret = waitpid(child->pid, &status, 0); + if (ret == -1 && errno == EINTR) + continue; + + if (ret == -1) { + ksft_print_msg("waitpid(%d) failed: %s (%d)\n", + child->pid, strerror(errno), + errno); + fail = true; + break; + } + + if (WIFEXITED(status)) { + child->exit_status = WEXITSTATUS(status); + child->exited = true; + } + + if (WIFSIGNALED(status)) { + child->exit_signal = WTERMSIG(status); + ksft_print_msg("%s: Exited due to signal %d\n", + child->name); + fail = true; + child->exited = true; + } + } while (!child->exited); + } + + if (!child->output_seen) { + ksft_print_msg("%s no output seen\n", child->name); + fail = true; + } + + if (child->exit_status != 0) { + ksft_print_msg("%s exited with error code %d\n", + child->name, child->exit_status); + fail = true; + } + + ksft_test_result(!fail, "%s\n", child->name); +} + +static void handle_child_signal(int sig, siginfo_t *info, void *context) +{ + int i; + bool found = false; + + for (i = 0; i < num_children; i++) { + if (children[i].pid == info->si_pid) { + children[i].exited = true; + children[i].exit_status = 
info->si_status; + found = true; + break; + } + } + + if (!found) + ksft_print_msg("SIGCHLD for unknown PID %d with status %d\n", + info->si_pid, info->si_status); +} + +static void handle_exit_signal(int sig, siginfo_t *info, void *context) +{ + int i; + + /* If we're already exiting then don't signal again */ + if (terminate) + return; + + ksft_print_msg("Got signal, exiting...\n"); + + terminate = true; + + /* + * This should be redundant, the main loop should clean up + * after us, but for safety stop everything we can here. + */ + for (i = 0; i < num_children; i++) + child_stop(&children[i]); +} + +/* Handle any pending output without blocking */ +static void drain_output(bool flush) +{ + int ret = 1; + int i; + + while (ret > 0) { + ret = epoll_wait(epoll_fd, evs, tests, 0); + if (ret < 0) { + if (errno == EINTR) + continue; + ksft_print_msg("epoll_wait() failed: %s (%d)\n", + strerror(errno), errno); + } + + for (i = 0; i < ret; i++) + child_output(evs[i].data.ptr, evs[i].events, flush); + } +} + +static const struct option options[] = { + { "timeout", required_argument, NULL, 't' }, + { } +}; + +int main(int argc, char **argv) +{ + int seen_children; + bool all_children_started = false; + int gcs_threads; + int timeout = 10; + int ret, cpus, i, c; + struct sigaction sa; + + while ((c = getopt_long(argc, argv, "t:", options, NULL)) != -1) { + switch (c) { + case 't': + ret = sscanf(optarg, "%d", &timeout); + if (ret != 1) + ksft_exit_fail_msg("Failed to parse timeout %s\n", + optarg); + break; + default: + ksft_exit_fail_msg("Unknown argument\n"); + } + } + + cpus = num_processors(); + tests = 0; + + if (getauxval(AT_HWCAP) & HWCAP_GCS) { + /* One extra thread, trying to trigger migrations */ + gcs_threads = cpus + 1; + tests += gcs_threads; + } else { + gcs_threads = 0; + } + + ksft_print_header(); + ksft_set_plan(tests); + + ksft_print_msg("%d CPUs, %d GCS threads\n", + cpus, gcs_threads); + + if (!tests) + ksft_exit_skip("No tests scheduled\n"); + + if (timeout > 0) + ksft_print_msg("Will run for %ds\n", timeout); + else + ksft_print_msg("Will run until terminated\n"); + + children = calloc(sizeof(*children), tests); + if (!children) + ksft_exit_fail_msg("Unable to allocate child data\n"); + + ret = epoll_create1(EPOLL_CLOEXEC); + if (ret < 0) + ksft_exit_fail_msg("epoll_create1() failed: %s (%d)\n", + strerror(errno), ret); + epoll_fd = ret; + + /* Create a pipe which children will block on before execing */ + ret = pipe(startup_pipe); + if (ret != 0) + ksft_exit_fail_msg("Failed to create startup pipe: %s (%d)\n", + strerror(errno), errno); + + /* Get signal handers ready before we start any children */ + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handle_exit_signal; + sa.sa_flags = SA_RESTART | SA_SIGINFO; + sigemptyset(&sa.sa_mask); + ret = sigaction(SIGINT, &sa, NULL); + if (ret < 0) + ksft_print_msg("Failed to install SIGINT handler: %s (%d)\n", + strerror(errno), errno); + ret = sigaction(SIGTERM, &sa, NULL); + if (ret < 0) + ksft_print_msg("Failed to install SIGTERM handler: %s (%d)\n", + strerror(errno), errno); + sa.sa_sigaction = handle_child_signal; + ret = sigaction(SIGCHLD, &sa, NULL); + if (ret < 0) + ksft_print_msg("Failed to install SIGCHLD handler: %s (%d)\n", + strerror(errno), errno); + + evs = calloc(tests, sizeof(*evs)); + if (!evs) + ksft_exit_fail_msg("Failed to allocated %d epoll events\n", + tests); + + for (i = 0; i < gcs_threads; i++) + start_thread(&children[i]); + + /* + * All children started, close the startup pipe and let them + * run. 
+ */ + close(startup_pipe[0]); + close(startup_pipe[1]); + + timeout *= 10; + for (;;) { + /* Did we get a signal asking us to exit? */ + if (terminate) + break; + + /* + * Timeout is counted in 100ms with no output, the + * tests print during startup then are silent when + * running so this should ensure they all ran enough + * to install the signal handler, this is especially + * useful in emulation where we will both be slow and + * likely to have a large set of VLs. + */ + ret = epoll_wait(epoll_fd, evs, tests, 100); + if (ret < 0) { + if (errno == EINTR) + continue; + ksft_exit_fail_msg("epoll_wait() failed: %s (%d)\n", + strerror(errno), errno); + } + + /* Output? */ + if (ret > 0) { + for (i = 0; i < ret; i++) { + child_output(evs[i].data.ptr, evs[i].events, + false); + } + continue; + } + + /* Otherwise epoll_wait() timed out */ + + /* + * If the child processes have not produced output they + * aren't actually running the tests yet. + */ + if (!all_children_started) { + seen_children = 0; + + for (i = 0; i < num_children; i++) + if (children[i].output_seen || + children[i].exited) + seen_children++; + + if (seen_children != num_children) { + ksft_print_msg("Waiting for %d children\n", + num_children - seen_children); + continue; + } + + all_children_started = true; + } + + ksft_print_msg("Sending signals, timeout remaining: %d00ms\n", + timeout); + + for (i = 0; i < num_children; i++) + child_tickle(&children[i]); + + /* Negative timeout means run indefinitely */ + if (timeout < 0) + continue; + if (--timeout == 0) + break; + } + + ksft_print_msg("Finishing up...\n"); + terminate = true; + + for (i = 0; i < tests; i++) + child_stop(&children[i]); + + drain_output(false); + + for (i = 0; i < tests; i++) + child_cleanup(&children[i]); + + drain_output(true); + + ksft_finished(); +} From bb9ae1a66c85eeb626864efd812c62026e126ec0 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:18 +0100 Subject: [PATCH 038/168] kselftest/arm64: Enable GCS for the FP stress tests While it's a bit off topic for them the floating point stress tests do give us some coverage of context thrashing cases, and also of active signal delivery separate to the relatively complicated framework in the actual signals tests. Have the tests enable GCS on startup, ignoring failures so they continue to work as before on systems without GCS. Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-39-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/assembler.h | 15 +++++++++++++++ tools/testing/selftests/arm64/fp/fpsimd-test.S | 2 ++ tools/testing/selftests/arm64/fp/sve-test.S | 2 ++ tools/testing/selftests/arm64/fp/za-test.S | 2 ++ tools/testing/selftests/arm64/fp/zt-test.S | 2 ++ 5 files changed, 23 insertions(+) diff --git a/tools/testing/selftests/arm64/fp/assembler.h b/tools/testing/selftests/arm64/fp/assembler.h index 9b38a0da407d..1fc46a5642c2 100644 --- a/tools/testing/selftests/arm64/fp/assembler.h +++ b/tools/testing/selftests/arm64/fp/assembler.h @@ -65,4 +65,19 @@ endfunction bl puts .endm +#define PR_SET_SHADOW_STACK_STATUS 75 +# define PR_SHADOW_STACK_ENABLE (1UL << 0) + +.macro enable_gcs + // Run with GCS + mov x0, PR_SET_SHADOW_STACK_STATUS + mov x1, PR_SHADOW_STACK_ENABLE + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x8, #__NR_prctl + svc #0 +.endm + #endif /* ! 
ASSEMBLER_H */
diff --git a/tools/testing/selftests/arm64/fp/fpsimd-test.S b/tools/testing/selftests/arm64/fp/fpsimd-test.S index 8b960d01ed2e..b16fb7f42e3e 100644 --- a/tools/testing/selftests/arm64/fp/fpsimd-test.S +++ b/tools/testing/selftests/arm64/fp/fpsimd-test.S @@ -215,6 +215,8 @@ endfunction // Main program entry point .globl _start function _start + enable_gcs + mov x23, #0 // signal count mov w0, #SIGINT
diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S index fff60e2a25ad..2fb4f0b84476 100644 --- a/tools/testing/selftests/arm64/fp/sve-test.S +++ b/tools/testing/selftests/arm64/fp/sve-test.S @@ -378,6 +378,8 @@ endfunction // Main program entry point .globl _start function _start + enable_gcs + mov x23, #0 // Irritation signal count mov w0, #SIGINT
diff --git a/tools/testing/selftests/arm64/fp/za-test.S b/tools/testing/selftests/arm64/fp/za-test.S index 095b45531640..b2603aba99de 100644 --- a/tools/testing/selftests/arm64/fp/za-test.S +++ b/tools/testing/selftests/arm64/fp/za-test.S @@ -231,6 +231,8 @@ endfunction // Main program entry point .globl _start function _start + enable_gcs + mov x23, #0 // signal count mov w0, #SIGINT
diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S index b5c81e81a379..8d9609a49008 100644 --- a/tools/testing/selftests/arm64/fp/zt-test.S +++ b/tools/testing/selftests/arm64/fp/zt-test.S @@ -200,6 +200,8 @@ endfunction // Main program entry point .globl _start function _start + enable_gcs + mov x23, #0 // signal count mov w0, #SIGINT
From cc847678998305c72c9850efa3fd040b274a8180 Mon Sep 17 00:00:00 2001 From: Yunhui Cui Date: Thu, 19 Sep 2024 11:46:01 +0800 Subject: [PATCH 039/168] drivers perf: remove unused field pmu_node The driver does not use the pmu_node field, so remove it. Signed-off-by: Yunhui Cui Reviewed-by: Shuai Xue Link: https://lore.kernel.org/r/20240919034601.2453-1-cuiyunhui@bytedance.com Signed-off-by: Will Deacon --- drivers/perf/dwc_pcie_pmu.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c index 4ca50f9b6dfe..59526a48499f 100644 --- a/drivers/perf/dwc_pcie_pmu.c +++ b/drivers/perf/dwc_pcie_pmu.c @@ -82,7 +82,6 @@ struct dwc_pcie_pmu { u16 ras_des_offset; u32 nr_lanes; - struct list_head pmu_node; struct hlist_node cpuhp_node; struct perf_event *event[DWC_PCIE_EVENT_TYPE_MAX]; int on_cpu;
From 6105c5d46d0b39ccf01e329fc4449ba840c2937d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 8 Oct 2024 16:58:49 +0100 Subject: [PATCH 040/168] arm64: probes: Move kprobes-specific fields
We share struct arch_probe_insn between kprobes and uprobes, but most of its fields aren't necessary for uprobes: * The 'insn' field is only used by kprobes as a pointer to the XOL slot. * The 'restore' field is only used by kprobes as the PC to restore after stepping an instruction in the XOL slot. * The 'pstate_cc' field isn't used by kprobes or uprobes, and seems to only exist as a result of copy-pasting the 32-bit arm implementation of kprobes. As these fields live in struct arch_probe_insn they cannot use definitions that only exist when CONFIG_KPROBES=y, such as the kprobe_opcode_t typedef, which we'd like to use in subsequent patches. Clean this up by removing the 'pstate_cc' field, and moving the kprobes-specific fields into the kprobes-specific struct arch_specific_insn.
To make it clear that the fields are related to stepping instructions in the XOL slot, 'insn' is renamed to 'xol_insn' and 'restore' is renamed to 'xol_restore'. At the same time, remove the misleading and useless comment above struct arch_probe_insn. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241008155851.801546-5-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/probes.h | 8 +++----- arch/arm64/kernel/probes/kprobes.c | 30 +++++++++++++++--------------- 2 files changed, 18 insertions(+), 20 deletions(-)
diff --git a/arch/arm64/include/asm/probes.h b/arch/arm64/include/asm/probes.h index 006946745352..4aa54322794d 100644 --- a/arch/arm64/include/asm/probes.h +++ b/arch/arm64/include/asm/probes.h @@ -12,18 +12,16 @@ typedef u32 probe_opcode_t; typedef void (probes_handler_t) (u32 opcode, long addr, struct pt_regs *); -/* architecture specific copy of original instruction */ struct arch_probe_insn { - probe_opcode_t *insn; - pstate_check_t *pstate_cc; probes_handler_t *handler; - /* restore address after step xol */ - unsigned long restore; }; #ifdef CONFIG_KPROBES typedef u32 kprobe_opcode_t; struct arch_specific_insn { struct arch_probe_insn api; + probe_opcode_t *xol_insn; + /* restore address after step xol */ + unsigned long xol_restore; }; #endif
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 4268678d0e86..222419a41a40 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -43,7 +43,7 @@ post_kprobe_handler(struct kprobe *, struct kprobe_ctlblk *, struct pt_regs *); static void __kprobes arch_prepare_ss_slot(struct kprobe *p) { - kprobe_opcode_t *addr = p->ainsn.api.insn; + kprobe_opcode_t *addr = p->ainsn.xol_insn; /* * Prepare insn slot, Mark Rutland points out it depends on a coupe of @@ -70,14 +70,14 @@ static void __kprobes arch_prepare_ss_slot(struct kprobe *p) /* * Needs restoring of return address after stepping xol. */ - p->ainsn.api.restore = (unsigned long) p->addr + + p->ainsn.xol_restore = (unsigned long) p->addr + sizeof(kprobe_opcode_t); } static void __kprobes arch_prepare_simulate(struct kprobe *p) { /* This instructions is not executed xol.
No need to adjust the PC */ - p->ainsn.api.restore = 0; + p->ainsn.xol_restore = 0; } static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) @@ -110,18 +110,18 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return -EINVAL; case INSN_GOOD_NO_SLOT: /* insn need simulation */ - p->ainsn.api.insn = NULL; + p->ainsn.xol_insn = NULL; break; case INSN_GOOD: /* instruction uses slot */ - p->ainsn.api.insn = get_insn_slot(); - if (!p->ainsn.api.insn) + p->ainsn.xol_insn = get_insn_slot(); + if (!p->ainsn.xol_insn) return -ENOMEM; break; } /* prepare the instruction */ - if (p->ainsn.api.insn) + if (p->ainsn.xol_insn) arch_prepare_ss_slot(p); else arch_prepare_simulate(p); @@ -148,9 +148,9 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { - if (p->ainsn.api.insn) { - free_insn_slot(p->ainsn.api.insn, 0); - p->ainsn.api.insn = NULL; + if (p->ainsn.xol_insn) { + free_insn_slot(p->ainsn.xol_insn, 0); + p->ainsn.xol_insn = NULL; } } @@ -205,9 +205,9 @@ static void __kprobes setup_singlestep(struct kprobe *p, } - if (p->ainsn.api.insn) { + if (p->ainsn.xol_insn) { /* prepare for single stepping */ - slot = (unsigned long)p->ainsn.api.insn; + slot = (unsigned long)p->ainsn.xol_insn; kprobes_save_local_irqflag(kcb, regs); instruction_pointer_set(regs, slot); @@ -245,8 +245,8 @@ static void __kprobes post_kprobe_handler(struct kprobe *cur, struct kprobe_ctlblk *kcb, struct pt_regs *regs) { /* return addr restore if non-branching insn */ - if (cur->ainsn.api.restore != 0) - instruction_pointer_set(regs, cur->ainsn.api.restore); + if (cur->ainsn.xol_restore != 0) + instruction_pointer_set(regs, cur->ainsn.xol_restore); /* restore back original saved kprobe variables and continue */ if (kcb->kprobe_status == KPROBE_REENTER) { @@ -348,7 +348,7 @@ kprobe_breakpoint_ss_handler(struct pt_regs *regs, unsigned long esr) struct kprobe *cur = kprobe_running(); if (cur && (kcb->kprobe_status & (KPROBE_HIT_SS | KPROBE_REENTER)) && - ((unsigned long)&cur->ainsn.api.insn[1] == addr)) { + ((unsigned long)&cur->ainsn.xol_insn[1] == addr)) { kprobes_restore_local_irqflag(kcb, regs); post_kprobe_handler(cur, kcb, regs); From dd0eb50e7c71fdd74f2ab0ef4b60bd6b27bbc670 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 8 Oct 2024 16:58:50 +0100 Subject: [PATCH 041/168] arm64: probes: Cleanup kprobes endianness conversions The core kprobes code uses kprobe_opcode_t for the in-memory representation of an instruction, using 'kprobe_opcode_t *' for XOL slots. As arm64 instructions are always little-endian 32-bit values, kprobes_opcode_t should be __le32, but at the moment kprobe_opcode_t is typedef'd to u32. Today there is no functional issue as we convert values via cpu_to_le32() and le32_to_cpu() where necessary, but these conversions are inconsistent with the types used, causing sparse warnings: | CHECK arch/arm64/kernel/probes/kprobes.c | arch/arm64/kernel/probes/kprobes.c:102:21: warning: cast to restricted __le32 | CHECK arch/arm64/kernel/probes/decode-insn.c | arch/arm64/kernel/probes/decode-insn.c:122:46: warning: cast to restricted __le32 | arch/arm64/kernel/probes/decode-insn.c:124:50: warning: cast to restricted __le32 | arch/arm64/kernel/probes/decode-insn.c:136:31: warning: cast to restricted __le32 Improve this by making kprobes_opcode_t a typedef for __le32 and consistently using this for pointers to executable instructions. With this change we can rely on the type system to tell us where conversions are necessary. 
Since kprobe::opcode is changed from u32 to __le32, the existing le32_to_cpu() converion moves from the point this is initialized (in arch_prepare_kprobe()) to the points this is consumed when passed to a handler or text patching function. As kprobe::opcode isn't altered or consumed elsewhere, this shouldn't result in a functional change. Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241008155851.801546-6-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/probes.h | 4 ++-- arch/arm64/kernel/probes/decode-insn.c | 2 +- arch/arm64/kernel/probes/kprobes.c | 9 +++++---- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/probes.h b/arch/arm64/include/asm/probes.h index 4aa54322794d..11e809733b7d 100644 --- a/arch/arm64/include/asm/probes.h +++ b/arch/arm64/include/asm/probes.h @@ -16,10 +16,10 @@ struct arch_probe_insn { probes_handler_t *handler; }; #ifdef CONFIG_KPROBES -typedef u32 kprobe_opcode_t; +typedef __le32 kprobe_opcode_t; struct arch_specific_insn { struct arch_probe_insn api; - probe_opcode_t *xol_insn; + kprobe_opcode_t *xol_insn; /* restore address after step xol */ unsigned long xol_restore; }; diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 3496d6169e59..147d6ddf3a4c 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -134,7 +134,7 @@ arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) { enum probe_insn decoded; probe_opcode_t insn = le32_to_cpu(*addr); - probe_opcode_t *scan_end = NULL; + kprobe_opcode_t *scan_end = NULL; unsigned long size = 0, offset = 0; struct arch_probe_insn *api = &asi->api; diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 222419a41a40..48d88e07611d 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -64,7 +64,7 @@ static void __kprobes arch_prepare_ss_slot(struct kprobe *p) * the BRK exception handler, so it is unnecessary to generate * Contex-Synchronization-Event via ISB again. 
*/ - aarch64_insn_patch_text_nosync(addr, p->opcode); + aarch64_insn_patch_text_nosync(addr, le32_to_cpu(p->opcode)); aarch64_insn_patch_text_nosync(addr + 1, BRK64_OPCODE_KPROBES_SS); /* @@ -85,7 +85,7 @@ static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); if (p->ainsn.api.handler) - p->ainsn.api.handler((u32)p->opcode, (long)p->addr, regs); + p->ainsn.api.handler(le32_to_cpu(p->opcode), (long)p->addr, regs); /* single step simulated, now go for post processing */ post_kprobe_handler(p, kcb, regs); @@ -99,7 +99,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return -EINVAL; /* copy instruction */ - p->opcode = le32_to_cpu(*p->addr); + p->opcode = *p->addr; if (search_exception_tables(probe_addr)) return -EINVAL; @@ -142,8 +142,9 @@ void __kprobes arch_arm_kprobe(struct kprobe *p) void __kprobes arch_disarm_kprobe(struct kprobe *p) { void *addr = p->addr; + u32 insn = le32_to_cpu(p->opcode); - aarch64_insn_patch_text(&addr, &p->opcode, 1); + aarch64_insn_patch_text(&addr, &insn, 1); } void __kprobes arch_remove_kprobe(struct kprobe *p)
From 14762109de024837dafcb893d45ccaf6a08ac990 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 8 Oct 2024 16:58:51 +0100 Subject: [PATCH 042/168] arm64: probes: Remove probe_opcode_t
The probe_opcode_t typedef for u32 isn't necessary, and is a source of confusion as it is easily confused with kprobe_opcode_t, which is a typedef for __le32. The typedef is only used within arch/arm64, and all of arm64's common insn code uses u32 for the endian-agnostic value of an instruction, so it'd be clearer to use u32 consistently. Remove probe_opcode_t and use u32 directly. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20241008155851.801546-7-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/probes.h | 1 - arch/arm64/kernel/probes/decode-insn.c | 4 ++-- arch/arm64/kernel/probes/decode-insn.h | 2 +- arch/arm64/kernel/probes/uprobes.c | 4 ++-- 4 files changed, 5 insertions(+), 6 deletions(-)
diff --git a/arch/arm64/include/asm/probes.h b/arch/arm64/include/asm/probes.h index 11e809733b7d..d49368886309 100644 --- a/arch/arm64/include/asm/probes.h +++ b/arch/arm64/include/asm/probes.h @@ -9,7 +9,6 @@ #include -typedef u32 probe_opcode_t; typedef void (probes_handler_t) (u32 opcode, long addr, struct pt_regs *); struct arch_probe_insn {
diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 147d6ddf3a4c..41b100bcb041 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -73,7 +73,7 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) * INSN_GOOD_NO_SLOT If instruction is supported but doesn't use its slot.
*/ enum probe_insn __kprobes -arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *api) +arm_probe_decode_insn(u32 insn, struct arch_probe_insn *api) { /* * Instructions reading or modifying the PC won't work from the XOL @@ -133,7 +133,7 @@ enum probe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) { enum probe_insn decoded; - probe_opcode_t insn = le32_to_cpu(*addr); + u32 insn = le32_to_cpu(*addr); kprobe_opcode_t *scan_end = NULL; unsigned long size = 0, offset = 0; struct arch_probe_insn *api = &asi->api; diff --git a/arch/arm64/kernel/probes/decode-insn.h b/arch/arm64/kernel/probes/decode-insn.h index 8b758c5a2062..0e4195de8206 100644 --- a/arch/arm64/kernel/probes/decode-insn.h +++ b/arch/arm64/kernel/probes/decode-insn.h @@ -28,6 +28,6 @@ enum probe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi); #endif enum probe_insn __kprobes -arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *asi); +arm_probe_decode_insn(u32 insn, struct arch_probe_insn *asi); #endif /* _ARM_KERNEL_KPROBES_ARM64_H */ diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index a2f137a595fc..fa0b7941d204 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -34,7 +34,7 @@ unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { - probe_opcode_t insn; + u32 insn; /* TODO: Currently we do not support AARCH32 instruction probing */ if (mm->context.flags & MMCF_AARCH32) @@ -102,7 +102,7 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t) bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { - probe_opcode_t insn; + u32 insn; unsigned long addr; if (!auprobe->simulate) From ab23df141f5318cf661504fb446159ce2dbd1ec2 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2024 13:39:15 +0100 Subject: [PATCH 043/168] arm64: asm-offsets: remove TSK_ACTIVE_MM The TSK_ACTIVE_MM definition isn't used anywhere. Remove it. Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241007123921.549340-2-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/asm-offsets.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 27de1dddb0ab..42fd36027545 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -28,8 +28,6 @@ int main(void) { - DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm)); - BLANK(); DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu)); DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); From 7bd8870af8dde527e0e8e83838de8966418c58f3 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2024 13:39:16 +0100 Subject: [PATCH 044/168] arm64: asm-offsets: remove VMA_VM_* The VMA_VM_MM definition is only used by the vma_vm_mm macro, which itself is unused. The VMA_VM_FLAGS definition isn't used anywhere. Remove them all. 
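For orientation, the entries being removed in this group of asm-offsets patches all follow the same pattern: each DEFINE() emits an assembler-visible constant derived from a C struct layout, so entries with no remaining assembly users can simply be deleted. A minimal illustrative sketch of that pattern, using an entry that does survive the series and assuming the usual <linux/kbuild.h> helpers (this is not a complete asm-offsets.c):

    #include <linux/kbuild.h>
    #include <linux/sched.h>
    #include <linux/stddef.h>

    int main(void)
    {
            /* ends up as "#define TSK_TI_FLAGS <offset>" in include/generated/asm-offsets.h */
            DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags));
            BLANK();
            return 0;
    }
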
Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241007123921.549340-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/assembler.h | 7 ------- arch/arm64/kernel/asm-offsets.c | 3 --- 2 files changed, 10 deletions(-)
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index bc0b0d75acef..3d8d534a7a77 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -248,13 +248,6 @@ alternative_endif ldr \dst, [\dst, \tmp] .endm -/* - * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm) - */ - .macro vma_vm_mm, rd, rn - ldr \rd, [\rn, #VMA_VM_MM] - .endm - /* * read_ctr - read CTR_EL0. If the system has mismatched register fields, * provide the system wide safe value from arm64_ftr_reg_ctrel0.sys_val
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 42fd36027545..5f8b83915fac 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -104,9 +104,6 @@ int main(void) #endif DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); BLANK(); - DEFINE(VMA_VM_MM, offsetof(struct vm_area_struct, vm_mm)); - DEFINE(VMA_VM_FLAGS, offsetof(struct vm_area_struct, vm_flags)); - BLANK(); DEFINE(VM_EXEC, VM_EXEC); BLANK(); DEFINE(PAGE_SZ, PAGE_SIZE);
From 4c92c121c402cf3a9a60f58fbd2650577c68f1ac Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2024 13:39:17 +0100 Subject: [PATCH 045/168] arm64: asm-offsets: remove COMPAT_{RT_,}SIGFRAME_REGS_OFFSET
The COMPAT_SIGFRAME_REGS_OFFSET and COMPAT_RT_SIGFRAME_REGS_OFFSET definitions aren't used anywhere. They were added in commit: f14d8025d263f3c8 ("arm64: compat: Generate asm offsets for signals") ... and subsequently their only user was removed in commit: 2d071968a4052e58 ("arm64: compat: Remove 32-bit sigreturn code from the vDSO") ... leaving them unused. Remove them. Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241007123921.549340-4-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/asm-offsets.c | 6 ------ 1 file changed, 6 deletions(-)
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 5f8b83915fac..520914b901a2 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -96,11 +95,6 @@ int main(void) #endif DEFINE(FREGS_SIZE, sizeof(struct ftrace_regs)); BLANK(); -#endif -#ifdef CONFIG_COMPAT - DEFINE(COMPAT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_sigframe, uc.uc_mcontext.arm_r0)); - DEFINE(COMPAT_RT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_rt_sigframe, sig.uc.uc_mcontext.arm_r0)); - BLANK(); #endif DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); BLANK();
From 1abc7c1e593337491d27b84a2a0b79c6d8164811 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2024 13:39:18 +0100 Subject: [PATCH 046/168] arm64: asm-offsets: remove MM_CONTEXT_ID
The only user of the MM_CONTEXT_ID definition was removed in commit: 25b92693a1b67a47 ("arm64: mm: convert cpu_do_switch_mm() to C") Remove MM_CONTEXT_ID.
Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241007123921.549340-5-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/asm-offsets.c | 2 -- 1 file changed, 2 deletions(-)
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 520914b901a2..e6a6b5160f75 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -96,8 +96,6 @@ int main(void) DEFINE(FREGS_SIZE, sizeof(struct ftrace_regs)); BLANK(); #endif - DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); - BLANK(); DEFINE(VM_EXEC, VM_EXEC); BLANK(); DEFINE(PAGE_SZ, PAGE_SIZE);
From 4ce689b4480a528f8d088c71d26744944e360f76 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2024 13:39:19 +0100 Subject: [PATCH 047/168] arm64: asm-offsets: remove VM_EXEC and PAGE_SZ
The VM_EXEC definition duplicates the common VM_EXEC definition from . The common definition cannot safely be included by assembly code but currently we don't need to use VM_EXEC in assembly. The PAGE_SZ definition duplicates arm64's definition of PAGE_SIZE from which can safely be included from assembly code and should be used directly. Remove the duplicate definitions. Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241007123921.549340-6-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/asm-offsets.c | 4 ---- 1 file changed, 4 deletions(-)
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index e6a6b5160f75..7c7c8d98256f 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -96,10 +96,6 @@ int main(void) DEFINE(FREGS_SIZE, sizeof(struct ftrace_regs)); BLANK(); #endif - DEFINE(VM_EXEC, VM_EXEC); - BLANK(); - DEFINE(PAGE_SZ, PAGE_SIZE); - BLANK(); DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE); DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE); BLANK();
From b129125e1f963500fd1cbd19e4155a65ce922944 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2024 13:39:20 +0100 Subject: [PATCH 048/168] arm64: asm-offsets: remove DMA_{TO,FROM}_DEVICE
The DMA_TO_DEVICE and DMA_FROM_DEVICE definitions in asm-offsets duplicate the common definitions from (which used to live in ), and haven't been used from assembly code since commit: 7eacf1858bc86fe9 ("arm64: mm: Remove assembly DMA cache maintenance wrappers") Remove them both.
Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241007123921.549340-7-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/asm-offsets.c | 4 ---- 1 file changed, 4 deletions(-)
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 7c7c8d98256f..e5ffdc0163aa 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -96,9 +95,6 @@ int main(void) DEFINE(FREGS_SIZE, sizeof(struct ftrace_regs)); BLANK(); #endif - DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE); - DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE); - BLANK(); DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET); BLANK(); DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task));
From 7bb32dc788ddd0adae1273e799b920a790610fac Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2024 13:39:21 +0100 Subject: [PATCH 049/168] arm64: asm-offsets: remove PREEMPT_DISABLE_OFFSET
The PREEMPT_DISABLE_OFFSET definition was added in commit: 24534b3511828c66 ("arm64: assembler: add macros to conditionally yield the NEON under PREEMPT") ... but hasn't been used since commit: 3931261ecf46151a ("arm64: fpsimd: Bring cond_yield asm macro in line with new rules") Remove PREEMPT_DISABLE_OFFSET. Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241007123921.549340-8-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/asm-offsets.c | 3 --- 1 file changed, 3 deletions(-)
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index e5ffdc0163aa..cb2e9567dca9 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -95,8 +94,6 @@ int main(void) DEFINE(FREGS_SIZE, sizeof(struct ftrace_regs)); BLANK(); #endif - DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET); - BLANK(); DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); BLANK(); DEFINE(FTR_OVR_VAL_OFFSET, offsetof(struct arm64_ftr_override, val));
From ac4ad5c09b34adb9c2937af874b63762fe66f725 Mon Sep 17 00:00:00 2001 From: Liao Chang Date: Mon, 9 Sep 2024 07:11:14 +0000 Subject: [PATCH 050/168] arm64: insn: Simulate nop instruction for better uprobe performance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
v2->v1: 1. Remove the simulation of STP and the related bits. 2. Use arm64_skip_faulting_instruction for single-stepping or FEAT_BTI scenario.
As Andrii pointed out, the uprobe/uretprobe selftest bench runs into a counterintuitive result that the nop and push variants are much slower than the ret variant [0]. The root cause lies in arch_probe_analyse_insn(), which excludes 'nop' and 'stp' from the emulatable instructions list. This forces the kernel to return to userspace and execute them out-of-line, then trap back into the kernel to run the uprobe callback functions. This leads to a significant performance overhead compared to the 'ret' variant, which is already emulated. Typically, a uprobe is installed on 'nop' for USDT and on function entry, which starts with the instruction 'stp x29, x30, [sp, #imm]!' to push lr and fp onto the stack, regardless of whether it is a kernel or userspace binary. In order to improve the performance of handling uprobes for common use cases, this patch supports the emulation of the arm64 equivalents of the 'nop' and 'push' instructions.
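As an illustrative aside (not part of the patch), the two probe sites referred to above look roughly like this in a user binary: a USDT probe point compiles down to a single nop, and an arm64 function entry normally begins by pushing the frame record:

    /* hypothetical probed function, for illustration only */
    void probed_function(void)
    {
            /*
             * The compiler-generated prologue at the entry point is where a
             * function-entry uprobe lands, typically:
             *   stp x29, x30, [sp, #-16]!   // push fp and lr
             *   mov x29, sp
             */

            /* a USDT-style probe site: a lone nop for a uprobe to land on */
            asm volatile("nop");
    }
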
The benchmark results below indicate that the performance gain from emulation is obvious. On Kunpeng916 (Hi1616), 4 NUMA nodes, 64 Arm64 cores@2.4GHz.
xol (1 cpus)
------------
uprobe-nop: 0.916 ± 0.001M/s (0.916M/prod)
uprobe-push: 0.908 ± 0.001M/s (0.908M/prod)
uprobe-ret: 1.855 ± 0.000M/s (1.855M/prod)
uretprobe-nop: 0.640 ± 0.000M/s (0.640M/prod)
uretprobe-push: 0.633 ± 0.001M/s (0.633M/prod)
uretprobe-ret: 0.978 ± 0.003M/s (0.978M/prod)
emulation (1 cpus)
-------------------
uprobe-nop: 1.862 ± 0.002M/s (1.862M/prod)
uprobe-push: 1.743 ± 0.006M/s (1.743M/prod)
uprobe-ret: 1.840 ± 0.001M/s (1.840M/prod)
uretprobe-nop: 0.964 ± 0.004M/s (0.964M/prod)
uretprobe-push: 0.936 ± 0.004M/s (0.936M/prod)
uretprobe-ret: 0.940 ± 0.001M/s (0.940M/prod)
As shown above, the performance gap between the 'nop/push' and 'ret' variants has been significantly reduced. Because the emulation of the 'push' instruction needs to access userspace memory, it spends more cycles than the others. As Mark suggested [1], it is painful to emulate the correct atomicity and ordering properties of STP, especially when it interacts with MTE, POE, etc. So this patch just focuses on the simulation of 'nop'. The simulation of STP and related changes will be addressed in a separate patch. [0] https://lore.kernel.org/all/CAEf4BzaO4eG6hr2hzXYpn+7Uer4chS0R99zLn02ezZ5YruVuQw@mail.gmail.com/ [1] https://lore.kernel.org/all/Zr3RN4zxF5XPgjEB@J2N7QTR9R3/ CC: Andrii Nakryiko CC: Mark Rutland Signed-off-by: Liao Chang Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20240909071114.1150053-1-liaochang1@huawei.com [catalin.marinas@arm.com: small tweaks following MarkR's comments] Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/insn.h | 5 +++++ arch/arm64/kernel/probes/decode-insn.c | 9 +++++++++ arch/arm64/kernel/probes/simulate-insn.c | 6 ++++++ arch/arm64/kernel/probes/simulate-insn.h | 1 + 4 files changed, 21 insertions(+)
diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 8c0a36f72d6f..89bc18989b90 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -575,6 +575,11 @@ static __always_inline u32 aarch64_insn_gen_nop(void) return aarch64_insn_gen_hint(AARCH64_INSN_HINT_NOP); } +static __always_inline bool aarch64_insn_is_nop(u32 insn) +{ + return insn == aarch64_insn_gen_nop(); +} + u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg, enum aarch64_insn_branch_type type); u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg,
diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 41b100bcb041..e05249f57075 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -75,6 +75,15 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) enum probe_insn __kprobes arm_probe_decode_insn(u32 insn, struct arch_probe_insn *api) { + /* + * While 'nop' instruction can execute in the out-of-line slot, + * simulating them in breakpoint handling offers better performance. + */ + if (aarch64_insn_is_nop(insn)) { + api->handler = simulate_nop; + return INSN_GOOD_NO_SLOT; + } + /* + * Instructions reading or modifying the PC won't work from the XOL + * slot.
diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index b65334ab79d2..4c6d2d712fbd 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -196,3 +196,9 @@ simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs) instruction_pointer_set(regs, instruction_pointer(regs) + 4); } + +void __kprobes +simulate_nop(u32 opcode, long addr, struct pt_regs *regs) +{ + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); +} diff --git a/arch/arm64/kernel/probes/simulate-insn.h b/arch/arm64/kernel/probes/simulate-insn.h index e065dc92218e..efb2803ec943 100644 --- a/arch/arm64/kernel/probes/simulate-insn.h +++ b/arch/arm64/kernel/probes/simulate-insn.h @@ -16,5 +16,6 @@ void simulate_cbz_cbnz(u32 opcode, long addr, struct pt_regs *regs); void simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs); void simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs); void simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs); +void simulate_nop(u32 opcode, long addr, struct pt_regs *regs); #endif /* _ARM_KERNEL_KPROBES_SIMULATE_INSN_H */ From 7ffc13e233951f15728c9d09db3cc8d9f6cf81f2 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 23 Sep 2024 21:13:50 +0800 Subject: [PATCH 051/168] arm64: tlbflush: add __flush_tlb_range_limit_excess() The __flush_tlb_range_limit_excess() helper will be used when flush tlb kernel range soon. Signed-off-by: Kefeng Wang Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20240923131351.713304-2-wangkefeng.wang@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 95fbc8c05607..5f5e7d1f2e7d 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -431,6 +431,23 @@ do { \ #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \ __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled()); +static inline bool __flush_tlb_range_limit_excess(unsigned long start, + unsigned long end, unsigned long pages, unsigned long stride) +{ + /* + * When the system does not support TLB range based flush + * operation, (MAX_DVM_OPS - 1) pages can be handled. But + * with TLB range based operation, MAX_TLBI_RANGE_PAGES + * pages can be handled. + */ + if ((!system_supports_tlb_range() && + (end - start) >= (MAX_DVM_OPS * stride)) || + pages > MAX_TLBI_RANGE_PAGES) + return true; + + return false; +} + static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long stride, bool last_level, @@ -442,15 +459,7 @@ static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma, end = round_up(end, stride); pages = (end - start) >> PAGE_SHIFT; - /* - * When not uses TLB range ops, we can handle up to - * (MAX_DVM_OPS - 1) pages; - * When uses TLB range ops, we can handle up to - * MAX_TLBI_RANGE_PAGES pages. 
- */ - if ((!system_supports_tlb_range() && - (end - start) >= (MAX_DVM_OPS * stride)) || - pages > MAX_TLBI_RANGE_PAGES) { + if (__flush_tlb_range_limit_excess(start, end, pages, stride)) { flush_tlb_mm(vma->vm_mm); return; } From a923705c69f7f4ebe6a5488c1f556bed12d28031 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 23 Sep 2024 21:13:51 +0800 Subject: [PATCH 052/168] arm64: optimize flush tlb kernel range Currently the kernel TLBs is flushed page by page if the target VA range is less than MAX_DVM_OPS * PAGE_SIZE, otherwise we'll brutally issue a TLBI ALL. But we could optimize it when CPU supports TLB range operations, convert to use __flush_tlb_range_op() like other tlb range flush to improve performance. Co-developed-by: Yicong Yang Signed-off-by: Yicong Yang Signed-off-by: Kefeng Wang Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20240923131351.713304-3-wangkefeng.wang@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 5f5e7d1f2e7d..bc94e036a26b 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -501,19 +501,21 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) { - unsigned long addr; + const unsigned long stride = PAGE_SIZE; + unsigned long pages; - if ((end - start) > (MAX_DVM_OPS * PAGE_SIZE)) { + start = round_down(start, stride); + end = round_up(end, stride); + pages = (end - start) >> PAGE_SHIFT; + + if (__flush_tlb_range_limit_excess(start, end, pages, stride)) { flush_tlb_all(); return; } - start = __TLBI_VADDR(start, 0); - end = __TLBI_VADDR(end, 0); - dsb(ishst); - for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12)) - __tlbi(vaale1is, addr); + __flush_tlb_range_op(vaale1is, start, pages, stride, 0, + TLBI_TTL_UNKNOWN, false, lpa2_is_enabled()); dsb(ish); isb(); } From 8ef41786d88fd429829bd143323168a9468e3be0 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 1 Oct 2024 10:28:04 +0530 Subject: [PATCH 053/168] arm64/mm: Change pgattr_change_is_safe() arguments as pteval_t pgattr_change_is_safe() processes two distinct page table entries that just happen to be 64 bits for all levels. This changes both arguments to reflect the actual data type being processed in the function. This change is important when moving to FEAT_D128 based 128 bit page tables because it makes it simple to change the entry size in one place. 
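To make that point concrete, an illustrative sketch only (the config symbol below is hypothetical and not part of this series): once this function and its callers are written against pteval_t, moving to 128-bit descriptors becomes a typedef change rather than a tree-wide edit:

    #include <linux/types.h>

    #ifdef CONFIG_ARM64_128BIT_DESCRIPTORS          /* hypothetical option */
    typedef unsigned __int128 pteval_t;
    #else
    typedef u64 pteval_t;
    #endif

    bool pgattr_change_is_safe(pteval_t old, pteval_t new);
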
Cc: Will Deacon Cc: Mark Rutland Cc: Ard Biesheuvel Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Ryan Roberts Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20241001045804.1119881-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 2 +- arch/arm64/mm/mmu.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index c329ea061dc9..d56dbe5742d6 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -338,7 +338,7 @@ static inline pte_t __ptep_get(pte_t *ptep) } extern void __sync_icache_dcache(pte_t pteval); -bool pgattr_change_is_safe(u64 old, u64 new); +bool pgattr_change_is_safe(pteval_t old, pteval_t new); /* * PTE bits configuration in the presence of hardware Dirty Bit Management diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index e55b02fbddc8..c1b2d0dc3078 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -119,7 +119,7 @@ static phys_addr_t __init early_pgtable_alloc(int shift) return phys; } -bool pgattr_change_is_safe(u64 old, u64 new) +bool pgattr_change_is_safe(pteval_t old, pteval_t new) { /* * The following mapping attributes may be updated in live From 25c17c4b55def92a01e3eecc9c775a6ee25ca20f Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 1 Oct 2024 15:52:19 -0700 Subject: [PATCH 054/168] hugetlb: arm64: add mte support Enable MTE support for hugetlb. The MTE page flags will be set on the folio only. When copying hugetlb folio (for example, CoW), the tags for all subpages will be copied when copying the first subpage. When freeing hugetlb folio, the MTE flags will be cleared. Reviewed-by: Catalin Marinas Reviewed-by: David Hildenbrand Signed-off-by: Yang Shi Link: https://lore.kernel.org/r/20241001225220.271178-1-yang@os.amperecomputing.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/hugetlb.h | 8 ++++ arch/arm64/include/asm/mman.h | 3 +- arch/arm64/include/asm/mte.h | 67 ++++++++++++++++++++++++++++++++ arch/arm64/kernel/hibernate.c | 6 +++ arch/arm64/kernel/mte.c | 27 ++++++++++++- arch/arm64/kvm/guest.c | 16 ++++++-- arch/arm64/kvm/mmu.c | 11 ++++++ arch/arm64/mm/copypage.c | 27 ++++++++++++- fs/hugetlbfs/inode.c | 2 +- 9 files changed, 159 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 293f880865e8..c6dff3e69539 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -11,6 +11,7 @@ #define __ASM_HUGETLB_H #include +#include #include #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION @@ -21,6 +22,13 @@ extern bool arch_hugetlb_migration_supported(struct hstate *h); static inline void arch_clear_hugetlb_flags(struct folio *folio) { clear_bit(PG_dcache_clean, &folio->flags); + +#ifdef CONFIG_ARM64_MTE + if (system_supports_mte()) { + clear_bit(PG_mte_tagged, &folio->flags); + clear_bit(PG_mte_lock, &folio->flags); + } +#endif } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h index 9e39217b4afb..65bc2b07f666 100644 --- a/arch/arm64/include/asm/mman.h +++ b/arch/arm64/include/asm/mman.h @@ -38,7 +38,8 @@ static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags) * backed by tags-capable memory. The vm_flags may be overridden by a * filesystem supporting MTE (RAM-based). 
*/ - if (system_supports_mte() && (flags & MAP_ANONYMOUS)) + if (system_supports_mte() && + (flags & (MAP_ANONYMOUS | MAP_HUGETLB))) return VM_MTE_ALLOWED; return 0; diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index 0f84518632b4..6567df8ec8ca 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -41,6 +41,8 @@ void mte_free_tag_storage(char *storage); static inline void set_page_mte_tagged(struct page *page) { + VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); + /* * Ensure that the tags written prior to this function are visible * before the page flags update. @@ -53,6 +55,8 @@ static inline bool page_mte_tagged(struct page *page) { bool ret = test_bit(PG_mte_tagged, &page->flags); + VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); + /* * If the page is tagged, ensure ordering with a likely subsequent * read of the tags. @@ -76,6 +80,8 @@ static inline bool page_mte_tagged(struct page *page) */ static inline bool try_page_mte_tagging(struct page *page) { + VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); + if (!test_and_set_bit(PG_mte_lock, &page->flags)) return true; @@ -157,6 +163,67 @@ static inline int mte_ptrace_copy_tags(struct task_struct *child, #endif /* CONFIG_ARM64_MTE */ +#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_ARM64_MTE) +static inline void folio_set_hugetlb_mte_tagged(struct folio *folio) +{ + VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); + + /* + * Ensure that the tags written prior to this function are visible + * before the folio flags update. + */ + smp_wmb(); + set_bit(PG_mte_tagged, &folio->flags); + +} + +static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio) +{ + bool ret = test_bit(PG_mte_tagged, &folio->flags); + + VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); + + /* + * If the folio is tagged, ensure ordering with a likely subsequent + * read of the tags. + */ + if (ret) + smp_rmb(); + return ret; +} + +static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio) +{ + VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); + + if (!test_and_set_bit(PG_mte_lock, &folio->flags)) + return true; + + /* + * The tags are either being initialised or may have been initialised + * already. Check if the PG_mte_tagged flag has been set or wait + * otherwise. 
+ */ + smp_cond_load_acquire(&folio->flags, VAL & (1UL << PG_mte_tagged)); + + return false; +} +#else +static inline void folio_set_hugetlb_mte_tagged(struct folio *folio) +{ +} + +static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio) +{ + return false; +} + +static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio) +{ + return false; +} +#endif + static inline void mte_disable_tco_entry(struct task_struct *task) { if (!system_supports_mte()) diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 7b11d84f533c..18749e9a6c2d 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -266,9 +266,15 @@ static int swsusp_mte_save_tags(void) max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { struct page *page = pfn_to_online_page(pfn); + struct folio *folio; if (!page) continue; + folio = page_folio(page); + + if (folio_test_hugetlb(folio) && + !folio_test_hugetlb_mte_tagged(folio)) + continue; if (!page_mte_tagged(page)) continue; diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 6174671be7c1..2fbfd27ff5f2 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -38,7 +38,24 @@ EXPORT_SYMBOL_GPL(mte_async_or_asymm_mode); void mte_sync_tags(pte_t pte, unsigned int nr_pages) { struct page *page = pte_page(pte); - unsigned int i; + struct folio *folio = page_folio(page); + unsigned long i; + + if (folio_test_hugetlb(folio)) { + unsigned long nr = folio_nr_pages(folio); + + /* Hugetlb MTE flags are set for head page only */ + if (folio_try_hugetlb_mte_tagging(folio)) { + for (i = 0; i < nr; i++, page++) + mte_clear_page_tags(page_address(page)); + folio_set_hugetlb_mte_tagged(folio); + } + + /* ensure the tags are visible before the PTE is set */ + smp_wmb(); + + return; + } /* if PG_mte_tagged is set, tags have already been initialised */ for (i = 0; i < nr_pages; i++, page++) { @@ -410,6 +427,7 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, void *maddr; struct page *page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); + struct folio *folio; if (IS_ERR(page)) { err = PTR_ERR(page); @@ -428,7 +446,12 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, put_page(page); break; } - WARN_ON_ONCE(!page_mte_tagged(page)); + + folio = page_folio(page); + if (folio_test_hugetlb(folio)) + WARN_ON_ONCE(!folio_test_hugetlb_mte_tagged(folio)); + else + WARN_ON_ONCE(!page_mte_tagged(page)); /* limit access to the end of the page */ offset = offset_in_page(addr); diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 962f985977c2..e738a353b20e 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1055,6 +1055,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, void *maddr; unsigned long num_tags; struct page *page; + struct folio *folio; if (is_error_noslot_pfn(pfn)) { ret = -EFAULT; @@ -1068,10 +1069,13 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, ret = -EFAULT; goto out; } + folio = page_folio(page); maddr = page_address(page); if (!write) { - if (page_mte_tagged(page)) + if ((folio_test_hugetlb(folio) && + folio_test_hugetlb_mte_tagged(folio)) || + page_mte_tagged(page)) num_tags = mte_copy_tags_to_user(tags, maddr, MTE_GRANULES_PER_PAGE); else @@ -1085,14 +1089,20 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, * __set_ptes() in the VMM but still overriding the * tags, hence ignoring the return value. 
*/ - try_page_mte_tagging(page); + if (folio_test_hugetlb(folio)) + folio_try_hugetlb_mte_tagging(folio); + else + try_page_mte_tagging(page); num_tags = mte_copy_tags_from_user(maddr, tags, MTE_GRANULES_PER_PAGE); /* uaccess failed, don't leave stale tags */ if (num_tags != MTE_GRANULES_PER_PAGE) mte_clear_page_tags(maddr); - set_page_mte_tagged(page); + if (folio_test_hugetlb(folio)) + folio_set_hugetlb_mte_tagged(folio); + else + set_page_mte_tagged(page); kvm_release_pfn_dirty(pfn); } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index a509b63bd4dd..962449f9ac2f 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1401,10 +1401,21 @@ static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, { unsigned long i, nr_pages = size >> PAGE_SHIFT; struct page *page = pfn_to_page(pfn); + struct folio *folio = page_folio(page); if (!kvm_has_mte(kvm)) return; + if (folio_test_hugetlb(folio)) { + /* Hugetlb has MTE flags set on head page only */ + if (folio_try_hugetlb_mte_tagging(folio)) { + for (i = 0; i < nr_pages; i++, page++) + mte_clear_page_tags(page_address(page)); + folio_set_hugetlb_mte_tagged(folio); + } + return; + } + for (i = 0; i < nr_pages; i++, page++) { if (try_page_mte_tagging(page)) { mte_clear_page_tags(page_address(page)); diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c index a7bb20055ce0..87b3f1a25535 100644 --- a/arch/arm64/mm/copypage.c +++ b/arch/arm64/mm/copypage.c @@ -18,15 +18,40 @@ void copy_highpage(struct page *to, struct page *from) { void *kto = page_address(to); void *kfrom = page_address(from); + struct folio *src = page_folio(from); + struct folio *dst = page_folio(to); + unsigned int i, nr_pages; copy_page(kto, kfrom); if (kasan_hw_tags_enabled()) page_kasan_tag_reset(to); - if (system_supports_mte() && page_mte_tagged(from)) { + if (!system_supports_mte()) + return; + + if (folio_test_hugetlb(src) && + folio_test_hugetlb_mte_tagged(src)) { + if (!folio_try_hugetlb_mte_tagging(dst)) + return; + + /* + * Populate tags for all subpages. + * + * Don't assume the first page is head page since + * huge page copy may start from any subpage. + */ + nr_pages = folio_nr_pages(src); + for (i = 0; i < nr_pages; i++) { + kfrom = page_address(folio_page(src, i)); + kto = page_address(folio_page(dst, i)); + mte_copy_page_tags(kto, kfrom); + } + folio_set_hugetlb_mte_tagged(dst); + } else if (page_mte_tagged(from)) { /* It's a new page, shouldn't have been tagged yet */ WARN_ON_ONCE(!try_page_mte_tagging(to)); + mte_copy_page_tags(kto, kfrom); set_page_mte_tagged(to); } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 5cf327337e22..f26b3b53d7de 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -110,7 +110,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap unwinds (may be important on powerpc * and ia64). */ - vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); + vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND | VM_MTE_ALLOWED); vma->vm_ops = &hugetlb_vm_ops; ret = seal_check_write(info->seals, vma); From 27879e8cb6b0fdb5cdcd76685f290729309711c6 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 1 Oct 2024 15:52:20 -0700 Subject: [PATCH 055/168] selftests: arm64: add hugetlb mte tests The tests cover mmap, mprotect hugetlb with MTE prot and COW. 
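For context, a minimal userspace sketch of what the hugetlb MTE support enables; this is illustrative only, assumes an MTE-capable CPU and a preallocated 2MB hugetlb page (e.g. via /proc/sys/vm/nr_hugepages), and omits error handling:

    #include <sys/mman.h>
    #include <sys/prctl.h>

    #ifndef PROT_MTE
    #define PROT_MTE 0x20                   /* arm64 value */
    #endif

    int main(void)
    {
            /* opt this task in to synchronous MTE tag checking */
            prctl(PR_SET_TAGGED_ADDR_CTRL,
                  PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_SYNC |
                  (0xfffe << PR_MTE_TAG_SHIFT),
                  0, 0, 0);

            /* a tag-checked hugetlb mapping, now permitted by this series */
            char *p = mmap(NULL, 2 * 1024 * 1024,
                           PROT_READ | PROT_WRITE | PROT_MTE,
                           MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

            p[0] = 1;                       /* tag-checked access */
            return 0;
    }
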
Signed-off-by: Yang Shi Link: https://lore.kernel.org/r/20241001225220.271178-2-yang@os.amperecomputing.com Signed-off-by: Catalin Marinas --- .../arm64/mte/check_hugetlb_options.c | 285 ++++++++++++++++++ 1 file changed, 285 insertions(+) create mode 100644 tools/testing/selftests/arm64/mte/check_hugetlb_options.c diff --git a/tools/testing/selftests/arm64/mte/check_hugetlb_options.c b/tools/testing/selftests/arm64/mte/check_hugetlb_options.c new file mode 100644 index 000000000000..303260a6dc65 --- /dev/null +++ b/tools/testing/selftests/arm64/mte/check_hugetlb_options.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2024 Ampere Computing LLC + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kselftest.h" +#include "mte_common_util.h" +#include "mte_def.h" + +#define TAG_CHECK_ON 0 +#define TAG_CHECK_OFF 1 + +static unsigned long default_huge_page_size(void) +{ + unsigned long hps = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + + if (!f) + return 0; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { + hps <<= 10; + break; + } + } + + free(line); + fclose(f); + return hps; +} + +static bool is_hugetlb_allocated(void) +{ + unsigned long hps = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + + if (!f) + return false; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "Hugetlb: %lu kB", &hps) == 1) { + hps <<= 10; + break; + } + } + + free(line); + fclose(f); + + if (hps > 0) + return true; + + return false; +} + +static void write_sysfs(char *str, unsigned long val) +{ + FILE *f; + + f = fopen(str, "w"); + if (!f) { + ksft_print_msg("ERR: missing %s\n", str); + return; + } + fprintf(f, "%lu", val); + fclose(f); +} + +static void allocate_hugetlb() +{ + write_sysfs("/proc/sys/vm/nr_hugepages", 2); +} + +static void free_hugetlb() +{ + write_sysfs("/proc/sys/vm/nr_hugepages", 0); +} + +static int check_child_tag_inheritance(char *ptr, int size, int mode) +{ + int i, parent_tag, child_tag, fault, child_status; + pid_t child; + + parent_tag = MT_FETCH_TAG((uintptr_t)ptr); + fault = 0; + + child = fork(); + if (child == -1) { + ksft_print_msg("FAIL: child process creation\n"); + return KSFT_FAIL; + } else if (child == 0) { + mte_initialize_current_context(mode, (uintptr_t)ptr, size); + /* Do copy on write */ + memset(ptr, '1', size); + mte_wait_after_trig(); + if (cur_mte_cxt.fault_valid == true) { + fault = 1; + goto check_child_tag_inheritance_err; + } + for (i = 0; i < size; i += MT_GRANULE_SIZE) { + child_tag = MT_FETCH_TAG((uintptr_t)(mte_get_tag_address(ptr + i))); + if (parent_tag != child_tag) { + ksft_print_msg("FAIL: child mte tag (%d) mismatch\n", i); + fault = 1; + goto check_child_tag_inheritance_err; + } + } +check_child_tag_inheritance_err: + _exit(fault); + } + /* Wait for child process to terminate */ + wait(&child_status); + if (WIFEXITED(child_status)) + fault = WEXITSTATUS(child_status); + else + fault = 1; + return (fault) ? 
KSFT_FAIL : KSFT_PASS; +} + +static int check_mte_memory(char *ptr, int size, int mode, int tag_check) +{ + mte_initialize_current_context(mode, (uintptr_t)ptr, size); + memset(ptr, '1', size); + mte_wait_after_trig(); + if (cur_mte_cxt.fault_valid == true) + return KSFT_FAIL; + + return KSFT_PASS; +} + +static int check_hugetlb_memory_mapping(int mem_type, int mode, int mapping, int tag_check) +{ + char *ptr, *map_ptr; + int result; + unsigned long map_size; + + map_size = default_huge_page_size(); + + mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG); + map_ptr = (char *)mte_allocate_memory(map_size, mem_type, mapping, false); + if (check_allocated_memory(map_ptr, map_size, mem_type, false) != KSFT_PASS) + return KSFT_FAIL; + + mte_initialize_current_context(mode, (uintptr_t)map_ptr, map_size); + /* Only mte enabled memory will allow tag insertion */ + ptr = mte_insert_tags((void *)map_ptr, map_size); + if (!ptr || cur_mte_cxt.fault_valid == true) { + ksft_print_msg("FAIL: Insert tags on anonymous mmap memory\n"); + munmap((void *)map_ptr, map_size); + return KSFT_FAIL; + } + result = check_mte_memory(ptr, map_size, mode, tag_check); + mte_clear_tags((void *)ptr, map_size); + mte_free_memory((void *)map_ptr, map_size, mem_type, false); + if (result == KSFT_FAIL) + return KSFT_FAIL; + + return KSFT_PASS; +} + +static int check_clear_prot_mte_flag(int mem_type, int mode, int mapping) +{ + char *map_ptr; + int prot_flag, result; + unsigned long map_size; + + prot_flag = PROT_READ | PROT_WRITE; + mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG); + map_size = default_huge_page_size(); + map_ptr = (char *)mte_allocate_memory_tag_range(map_size, mem_type, mapping, + 0, 0); + if (check_allocated_memory_range(map_ptr, map_size, mem_type, + 0, 0) != KSFT_PASS) + return KSFT_FAIL; + /* Try to clear PROT_MTE property and verify it by tag checking */ + if (mprotect(map_ptr, map_size, prot_flag)) { + mte_free_memory_tag_range((void *)map_ptr, map_size, mem_type, + 0, 0); + ksft_print_msg("FAIL: mprotect not ignoring clear PROT_MTE property\n"); + return KSFT_FAIL; + } + result = check_mte_memory(map_ptr, map_size, mode, TAG_CHECK_ON); + mte_free_memory_tag_range((void *)map_ptr, map_size, mem_type, 0, 0); + if (result != KSFT_PASS) + return KSFT_FAIL; + + return KSFT_PASS; +} + +static int check_child_hugetlb_memory_mapping(int mem_type, int mode, int mapping) +{ + char *ptr; + int result; + unsigned long map_size; + + map_size = default_huge_page_size(); + + mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG); + ptr = (char *)mte_allocate_memory_tag_range(map_size, mem_type, mapping, + 0, 0); + if (check_allocated_memory_range(ptr, map_size, mem_type, + 0, 0) != KSFT_PASS) + return KSFT_FAIL; + result = check_child_tag_inheritance(ptr, map_size, mode); + mte_free_memory_tag_range((void *)ptr, map_size, mem_type, 0, 0); + if (result == KSFT_FAIL) + return result; + + return KSFT_PASS; +} + +int main(int argc, char *argv[]) +{ + int err; + + err = mte_default_setup(); + if (err) + return err; + + /* Register signal handlers */ + mte_register_signal(SIGBUS, mte_default_handler); + mte_register_signal(SIGSEGV, mte_default_handler); + + allocate_hugetlb(); + + if (!is_hugetlb_allocated()) { + ksft_print_msg("ERR: Unable allocate hugetlb pages\n"); + return KSFT_FAIL; + } + + /* Set test plan */ + ksft_set_plan(12); + + mte_enable_pstate_tco(); + + evaluate_test(check_hugetlb_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_OFF), + "Check hugetlb memory with private mapping, sync error mode, 
mmap memory and tag check off\n");
+
+ mte_disable_pstate_tco();
+ evaluate_test(check_hugetlb_memory_mapping(USE_MMAP, MTE_NONE_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_OFF),
+ "Check hugetlb memory with private mapping, no error mode, mmap memory and tag check off\n");
+
+ evaluate_test(check_hugetlb_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_ON),
+ "Check hugetlb memory with private mapping, sync error mode, mmap memory and tag check on\n");
+ evaluate_test(check_hugetlb_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_ON),
+ "Check hugetlb memory with private mapping, sync error mode, mmap/mprotect memory and tag check on\n");
+ evaluate_test(check_hugetlb_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_ON),
+ "Check hugetlb memory with private mapping, async error mode, mmap memory and tag check on\n");
+ evaluate_test(check_hugetlb_memory_mapping(USE_MPROTECT, MTE_ASYNC_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_ON),
+ "Check hugetlb memory with private mapping, async error mode, mmap/mprotect memory and tag check on\n");
+
+ evaluate_test(check_clear_prot_mte_flag(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+ "Check clear PROT_MTE flags with private mapping, sync error mode and mmap memory\n");
+ evaluate_test(check_clear_prot_mte_flag(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+ "Check clear PROT_MTE flags with private mapping and sync error mode and mmap/mprotect memory\n");
+
+ evaluate_test(check_child_hugetlb_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+ "Check child hugetlb memory with private mapping, precise mode and mmap memory\n");
+ evaluate_test(check_child_hugetlb_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+ "Check child hugetlb memory with private mapping, precise mode and mmap memory\n");
+ evaluate_test(check_child_hugetlb_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+ "Check child hugetlb memory with private mapping, precise mode and mmap/mprotect memory\n");
+ evaluate_test(check_child_hugetlb_memory_mapping(USE_MPROTECT, MTE_ASYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+ "Check child hugetlb memory with private mapping, precise mode and mmap/mprotect memory\n");
+
+ mte_restore_setup();
+ free_hugetlb();
+ ksft_print_cnts();
+ return ksft_get_fail_cnt() == 0 ? KSFT_PASS : KSFT_FAIL;
+}
From 48f8d9cef766f8ed4bbccc0d759710262d34f40b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sat, 5 Oct 2024 01:17:18 +0100 Subject: [PATCH 056/168] kselftest/arm64: Validate that GCS push and write permissions work Add trivial assembly programs which give themselves the appropriate permissions and then execute GCSPUSHM and GCSSTR; they will report errors by generating signals on the non-permitted instructions. Not using libc minimises the interaction with any policy set for the system, but we skip if we fail to get the permissions in case the system has been locked down to make them inaccessible. 
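For reference, the enable sequence the assembly performs corresponds roughly to the following C (an illustrative sketch only, not code from this patch; the tests deliberately avoid libc, and the constants are the ones defined in the test sources):

| #include <sys/prctl.h>
|
| #define PR_SET_SHADOW_STACK_STATUS	75
| #define PR_SHADOW_STACK_ENABLE		(1UL << 0)
| #define PR_SHADOW_STACK_PUSH		(1UL << 2)
|
| /* Ask for GCS with push permission; treat failure as a skip */
| if (prctl(PR_SET_SHADOW_STACK_STATUS,
|	    PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_PUSH, 0, 0, 0))
|	return KSFT_SKIP;

The gcsstr test does the same but requests PR_SHADOW_STACK_WRITE so that GCSSTR is permitted.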
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241005-arm64-gcs-test-flags-v1-1-03cb9786c5cd@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/gcs/.gitignore | 2 + tools/testing/selftests/arm64/gcs/Makefile | 8 +- tools/testing/selftests/arm64/gcs/gcspushm.S | 96 +++++++++++++++++++ tools/testing/selftests/arm64/gcs/gcsstr.S | 99 ++++++++++++++++++++ 4 files changed, 204 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/arm64/gcs/gcspushm.S create mode 100644 tools/testing/selftests/arm64/gcs/gcsstr.S diff --git a/tools/testing/selftests/arm64/gcs/.gitignore b/tools/testing/selftests/arm64/gcs/.gitignore index 1e8d1f6b27f2..bbb8e40a7e52 100644 --- a/tools/testing/selftests/arm64/gcs/.gitignore +++ b/tools/testing/selftests/arm64/gcs/.gitignore @@ -3,3 +3,5 @@ libc-gcs gcs-locking gcs-stress gcs-stress-thread +gcspushm +gcsstr diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile index d8b06ca51e22..d2f3497a9103 100644 --- a/tools/testing/selftests/arm64/gcs/Makefile +++ b/tools/testing/selftests/arm64/gcs/Makefile @@ -6,7 +6,7 @@ # nolibc. # -TEST_GEN_PROGS := basic-gcs libc-gcs gcs-locking gcs-stress +TEST_GEN_PROGS := basic-gcs libc-gcs gcs-locking gcs-stress gcspushm gcsstr TEST_GEN_PROGS_EXTENDED := gcs-stress-thread LDLIBS+=-lpthread @@ -22,3 +22,9 @@ $(OUTPUT)/basic-gcs: basic-gcs.c $(OUTPUT)/gcs-stress-thread: gcs-stress-thread.S $(CC) -nostdlib $^ -o $@ + +$(OUTPUT)/gcspushm: gcspushm.S + $(CC) -nostdlib $^ -o $@ + +$(OUTPUT)/gcsstr: gcsstr.S + $(CC) -nostdlib $^ -o $@ diff --git a/tools/testing/selftests/arm64/gcs/gcspushm.S b/tools/testing/selftests/arm64/gcs/gcspushm.S new file mode 100644 index 000000000000..bbe17c1325ac --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/gcspushm.S @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0-only +// +// Copyright 2024 Arm Limited +// +// Give ourselves GCS push permissions then use them + +#include + +/* Shadow Stack/Guarded Control Stack interface */ +#define PR_GET_SHADOW_STACK_STATUS 74 +#define PR_SET_SHADOW_STACK_STATUS 75 +#define PR_LOCK_SHADOW_STACK_STATUS 76 + +# define PR_SHADOW_STACK_ENABLE (1UL << 0) +# define PR_SHADOW_STACK_WRITE (1UL << 1) +# define PR_SHADOW_STACK_PUSH (1UL << 2) + +#define KSFT_SKIP 4 + +.macro function name + .macro endfunction + .type \name, @function + .purgem endfunction + .endm +\name: +.endm + +// Print a single character x0 to stdout +// Clobbers x0-x2,x8 +function putc + str x0, [sp, #-16]! 
+ + mov x0, #1 // STDOUT_FILENO + mov x1, sp + mov x2, #1 + mov x8, #__NR_write + svc #0 + + add sp, sp, #16 + ret +endfunction +.globl putc + +// Print a NUL-terminated string starting at address x0 to stdout +// Clobbers x0-x3,x8 +function puts + mov x1, x0 + + mov x2, #0 +0: ldrb w3, [x0], #1 + cbz w3, 1f + add x2, x2, #1 + b 0b + +1: mov w0, #1 // STDOUT_FILENO + mov x8, #__NR_write + svc #0 + + ret +endfunction +.globl puts + +// Utility macro to print a literal string +// Clobbers x0-x4,x8 +.macro puts string + .pushsection .rodata.str1.1, "aMS", @progbits, 1 +.L__puts_literal\@: .string "\string" + .popsection + + ldr x0, =.L__puts_literal\@ + bl puts +.endm + +.globl _start +function _start + // Run with GCS + mov x0, PR_SET_SHADOW_STACK_STATUS + mov x1, PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_PUSH + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x8, #__NR_prctl + svc #0 + cbz x0, 1f + puts "Failed to enable GCS with push permission\n" + mov x0, #KSFT_SKIP + b 2f +1: + sys #3, c7, c7, #0, x0 // GCSPUSHM + sysl x0, #3, c7, c7, #1 // GCSPOPM + + mov x0, #0 +2: + mov x8, #__NR_exit + svc #0 diff --git a/tools/testing/selftests/arm64/gcs/gcsstr.S b/tools/testing/selftests/arm64/gcs/gcsstr.S new file mode 100644 index 000000000000..a42bba6e30b1 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/gcsstr.S @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0-only +// +// Copyright 2024 Arm Limited +// +// Give ourselves GCS write permissions then use them + +#include + +/* Shadow Stack/Guarded Control Stack interface */ +#define PR_GET_SHADOW_STACK_STATUS 74 +#define PR_SET_SHADOW_STACK_STATUS 75 +#define PR_LOCK_SHADOW_STACK_STATUS 76 + +# define PR_SHADOW_STACK_ENABLE (1UL << 0) +# define PR_SHADOW_STACK_WRITE (1UL << 1) +# define PR_SHADOW_STACK_PUSH (1UL << 2) + +#define GCSPR_EL0 S3_3_C2_C5_1 + +#define KSFT_SKIP 4 + +.macro function name + .macro endfunction + .type \name, @function + .purgem endfunction + .endm +\name: +.endm + +// Print a single character x0 to stdout +// Clobbers x0-x2,x8 +function putc + str x0, [sp, #-16]! + + mov x0, #1 // STDOUT_FILENO + mov x1, sp + mov x2, #1 + mov x8, #__NR_write + svc #0 + + add sp, sp, #16 + ret +endfunction +.globl putc + +// Print a NUL-terminated string starting at address x0 to stdout +// Clobbers x0-x3,x8 +function puts + mov x1, x0 + + mov x2, #0 +0: ldrb w3, [x0], #1 + cbz w3, 1f + add x2, x2, #1 + b 0b + +1: mov w0, #1 // STDOUT_FILENO + mov x8, #__NR_write + svc #0 + + ret +endfunction +.globl puts + +// Utility macro to print a literal string +// Clobbers x0-x4,x8 +.macro puts string + .pushsection .rodata.str1.1, "aMS", @progbits, 1 +.L__puts_literal\@: .string "\string" + .popsection + + ldr x0, =.L__puts_literal\@ + bl puts +.endm + +.globl _start +function _start + // Run with GCS + mov x0, PR_SET_SHADOW_STACK_STATUS + mov x1, PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x8, #__NR_prctl + svc #0 + cbz x0, 1f + puts "Failed to enable GCS with write permission\n" + mov x0, #KSFT_SKIP + b 2f +1: + mrs x0, GCSPR_EL0 + sub x0, x0, #8 + .inst 0xd91f1c01 // GCSSTR x1, x0 + + mov x0, #0 +2: + mov x8, #__NR_exit + svc #0 From 9c4a25140dee5356254ddb921086c49fb93ba952 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Tue, 8 Oct 2024 15:01:21 +0100 Subject: [PATCH 057/168] arm64: cpufeature: add POE to cpucap_is_possible() Since de66cb37ab6 ("arm64: Add cpucap_is_possible()"), alternative_has_cap_unlikely() includes the IS_ENABLED() check. 
Add CONFIG_ARM64_POE to cpucap_is_possible() to avoid the explicit check. Signed-off-by: Joey Gouly Cc: Will Deacon Acked-by: Will Deacon Link: https://lore.kernel.org/r/20241008140121.2774348-1-joey.gouly@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cpucaps.h | 2 ++ arch/arm64/include/asm/cpufeature.h | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index a6e5b07b64fd..a08a1212ffbb 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -42,6 +42,8 @@ cpucap_is_possible(const unsigned int cap) return IS_ENABLED(CONFIG_ARM64_BTI); case ARM64_HAS_TLB_RANGE: return IS_ENABLED(CONFIG_ARM64_TLB_RANGE); + case ARM64_HAS_S1POE: + return IS_ENABLED(CONFIG_ARM64_POE); case ARM64_UNMAP_KERNEL_AT_EL0: return IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0); case ARM64_WORKAROUND_843419: diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 3d261cc123c1..4b6bc0bac9b9 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -834,8 +834,7 @@ static inline bool system_supports_lpa2(void) static inline bool system_supports_poe(void) { - return IS_ENABLED(CONFIG_ARM64_POE) && - alternative_has_cap_unlikely(ARM64_HAS_S1POE); + return alternative_has_cap_unlikely(ARM64_HAS_S1POE); } int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); From 9b9be78258511e67767e4aa51f587cf22feb5065 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 11 Oct 2024 15:36:25 +0100 Subject: [PATCH 058/168] kselftest/arm64: Ensure stable names for GCS stress test results The GCS stress test program currently uses the PID of the threads it creates in the test names it reports, resulting in unstable test names between runs. Fix this by using a thread number instead. 
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241011-arm64-gcs-stress-stable-name-v1-1-4950f226218e@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/gcs/gcs-stress.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress.c b/tools/testing/selftests/arm64/gcs/gcs-stress.c index bdec7ee8cfd5..03222c36c436 100644 --- a/tools/testing/selftests/arm64/gcs/gcs-stress.c +++ b/tools/testing/selftests/arm64/gcs/gcs-stress.c @@ -56,7 +56,7 @@ static int num_processors(void) return nproc; }
-static void start_thread(struct child_data *child)
+static void start_thread(struct child_data *child, int id)
 { int ret, pipefd[2], i; struct epoll_event ev;
@@ -132,7 +132,7 @@ static void start_thread(struct child_data *child) ev.events = EPOLLIN | EPOLLHUP; ev.data.ptr = child;
- ret = asprintf(&child->name, "Thread-%d", child->pid);
+ ret = asprintf(&child->name, "Thread-%d", id);
 if (ret == -1) ksft_exit_fail_msg("asprintf() failed\n");
@@ -437,7 +437,7 @@ int main(int argc, char **argv) tests); for (i = 0; i < gcs_threads; i++)
- start_thread(&children[i]);
+ start_thread(&children[i], i);
 /* * All children started, close the startup pipe and let them
From 0349934618907a0bfc9088282c526c2852d4ccaa Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 11 Oct 2024 11:30:25 +0100 Subject: [PATCH 059/168] arm64/sysreg: Update ID_AA64MMFR1_EL1 to DDI0601 2024-09 ID_AA64MMFR1_EL1 has been updated by the architecture to enumerate several new architectural features since the last time sysreg was updated; sync with the definition in DDI0601 2024-09 to include two new versions of each of ETS and HAFDBS. Reported-by: Marc Zyngier Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241011-arm64-aa64mmfr1-2024-09-v1-1-61935a085010@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/tools/sysreg | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 8d637ac4b7c6..ae64ba810298 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1648,6 +1648,8 @@ EndEnum UnsignedEnum 39:36 ETS 0b0000 NI 0b0001 IMP
+ 0b0010 ETS2
+ 0b0011 ETS3
 EndEnum UnsignedEnum 35:32 TWED 0b0000 NI @@ -1688,6 +1690,8 @@ UnsignedEnum 3:0 HAFDBS 0b0000 NI 0b0001 AF 0b0010 DBM
+ 0b0011 HAFT
+ 0b0100 HDBSS
 EndEnum EndSysreg
From 0f612c6eb13ad85a60e00c751c83e24f4a32cd0a Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 14 Oct 2024 13:03:41 +1000 Subject: [PATCH 060/168] arm64: head: Drop SWAPPER_TABLE_SHIFT There are no users of SWAPPER_TABLE_SHIFT after commit 84b04d3e6bdb ("arm64: kernel: Create initial ID map from C code"). Just drop it. 
Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20241014030341.995806-1-gshan@redhat.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/kernel-pgtable.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index bf05a77873a4..fd5a08450b12 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -26,7 +26,6 @@ #define SWAPPER_SKIP_LEVEL 0 #endif #define SWAPPER_BLOCK_SIZE (UL(1) << SWAPPER_BLOCK_SHIFT) -#define SWAPPER_TABLE_SHIFT (SWAPPER_BLOCK_SHIFT + PAGE_SHIFT - 3) #define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS - SWAPPER_SKIP_LEVEL) #define INIT_IDMAP_PGTABLE_LEVELS (IDMAP_LEVELS - SWAPPER_SKIP_LEVEL) From c56c599d9002d44f559be3852b371db46adac87c Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Mon, 30 Sep 2024 17:10:47 +0100 Subject: [PATCH 061/168] arm64: probes: Disable kprobes/uprobes on MOPS instructions FEAT_MOPS instructions require that all three instructions (prologue, main and epilogue) appear consecutively in memory. Placing a kprobe/uprobe on one of them doesn't work as only a single instruction gets executed out-of-line or simulated. So don't allow placing a probe on a MOPS instruction. Fixes: b7564127ffcb ("arm64: mops: detect and enable FEAT_MOPS") Signed-off-by: Kristina Martsenko Link: https://lore.kernel.org/r/20240930161051.3777828-2-kristina.martsenko@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/insn.h | 1 + arch/arm64/kernel/probes/decode-insn.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 8c0a36f72d6f..bc77869dbd43 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -353,6 +353,7 @@ __AARCH64_INSN_FUNCS(ldrsw_lit, 0xFF000000, 0x98000000) __AARCH64_INSN_FUNCS(exclusive, 0x3F800000, 0x08000000) __AARCH64_INSN_FUNCS(load_ex, 0x3F400000, 0x08400000) __AARCH64_INSN_FUNCS(store_ex, 0x3F400000, 0x08000000) +__AARCH64_INSN_FUNCS(mops, 0x3B200C00, 0x19000400) __AARCH64_INSN_FUNCS(stp, 0x7FC00000, 0x29000000) __AARCH64_INSN_FUNCS(ldp, 0x7FC00000, 0x29400000) __AARCH64_INSN_FUNCS(stp_post, 0x7FC00000, 0x28800000) diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 968d5fffe233..77f3c8eb0916 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -58,10 +58,13 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) * Instructions which load PC relative literals are not going to work * when executed from an XOL slot. Instructions doing an exclusive * load/store are not going to complete successfully when single-step - * exception handling happens in the middle of the sequence. + * exception handling happens in the middle of the sequence. Memory + * copy/set instructions require that all three instructions be placed + * consecutively in memory. 
*/ if (aarch64_insn_uses_literal(insn) || - aarch64_insn_is_exclusive(insn)) + aarch64_insn_is_exclusive(insn) || + aarch64_insn_is_mops(insn)) return false; return true; From 13840229d6bd5c191a9ca68ceba0af0fa03d7645 Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Mon, 30 Sep 2024 17:10:48 +0100 Subject: [PATCH 062/168] arm64: mops: Handle MOPS exceptions from EL1 We will soon be using MOPS instructions in the kernel, so wire up the exception handler to handle exceptions from EL1 caused by the copy/set operation being stopped and resumed on a different type of CPU. Add a helper for advancing the single step state machine, similarly to what the EL0 exception handler does. Signed-off-by: Kristina Martsenko Link: https://lore.kernel.org/r/20240930161051.3777828-3-kristina.martsenko@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/debug-monitors.h | 1 + arch/arm64/include/asm/exception.h | 1 + arch/arm64/kernel/debug-monitors.c | 5 +++++ arch/arm64/kernel/entry-common.c | 12 ++++++++++++ arch/arm64/kernel/traps.c | 7 +++++++ 5 files changed, 26 insertions(+) diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index 13d437bcbf58..8f6ba31b8658 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -105,6 +105,7 @@ void kernel_enable_single_step(struct pt_regs *regs); void kernel_disable_single_step(void); int kernel_active_single_step(void); void kernel_rewind_single_step(struct pt_regs *regs); +void kernel_fastforward_single_step(struct pt_regs *regs); #ifdef CONFIG_HAVE_HW_BREAKPOINT int reinstall_suspended_bps(struct pt_regs *regs); diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index f296662590c7..8689b95f6b53 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -73,6 +73,7 @@ void do_el0_svc_compat(struct pt_regs *regs); void do_el0_fpac(struct pt_regs *regs, unsigned long esr); void do_el1_fpac(struct pt_regs *regs, unsigned long esr); void do_el0_mops(struct pt_regs *regs, unsigned long esr); +void do_el1_mops(struct pt_regs *regs, unsigned long esr); void do_serror(struct pt_regs *regs, unsigned long esr); void do_signal(struct pt_regs *regs); diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 024a7b245056..c60a4a90c6a5 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -441,6 +441,11 @@ void kernel_rewind_single_step(struct pt_regs *regs) set_regs_spsr_ss(regs); } +void kernel_fastforward_single_step(struct pt_regs *regs) +{ + clear_regs_spsr_ss(regs); +} + /* ptrace API */ void user_enable_single_step(struct task_struct *task) { diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 3fcd9d080bf2..9d174cd541ef 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -463,6 +463,15 @@ static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) exit_to_kernel_mode(regs); } +static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr) +{ + enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_el1_mops(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs); +} + static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); @@ -505,6 +514,9 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_BTI: el1_bti(regs, esr); break; + case ESR_ELx_EC_MOPS: 
+ el1_mops(regs, esr); + break; case ESR_ELx_EC_BREAKPT_CUR: case ESR_ELx_EC_SOFTSTP_CUR: case ESR_ELx_EC_WATCHPT_CUR: diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 563cbce11126..fc6d44e06b8d 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -531,6 +531,13 @@ void do_el0_mops(struct pt_regs *regs, unsigned long esr) user_fastforward_single_step(current); } +void do_el1_mops(struct pt_regs *regs, unsigned long esr) +{ + arm64_mops_reset_regs(®s->user_regs, esr); + + kernel_fastforward_single_step(regs); +} + #define __user_cache_maint(insn, address, res) \ if (address >= TASK_SIZE_MAX) { \ res = -EFAULT; \ From b616058c6613e1fd3e2bc8d4c05b558c8854aab3 Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Mon, 30 Sep 2024 17:10:49 +0100 Subject: [PATCH 063/168] arm64: mops: Document booting requirement for HCR_EL2.MCE2 Document that hypervisors must set HCR_EL2.MCE2 and handle MOPS exceptions when they migrate a vCPU to another type of CPU, as Linux may not be able to handle the exception at all times. As one example, when running under nested virtualization, KVM does not handle MOPS exceptions from the nVHE/hVHE EL2 hyp as the hyp is never migrated, so the host hypervisor needs to handle them. There may be other situations (now or in the future) where the kernel can't handle an unexpected MOPS exception, so require that the hypervisor handles them. Signed-off-by: Kristina Martsenko Link: https://lore.kernel.org/r/20240930161051.3777828-4-kristina.martsenko@arm.com Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/booting.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/arch/arm64/booting.rst b/Documentation/arch/arm64/booting.rst index b57776a68f15..db46af5b9f0f 100644 --- a/Documentation/arch/arm64/booting.rst +++ b/Documentation/arch/arm64/booting.rst @@ -385,6 +385,9 @@ Before jumping into the kernel, the following conditions must be met: - HCRX_EL2.MSCEn (bit 11) must be initialised to 0b1. + - HCRX_EL2.MCE2 (bit 10) must be initialised to 0b1. The exception + handler must set PSTATE.SS to 0b0. + For CPUs with the Extended Translation Control Register feature (FEAT_TCR2): - If EL3 is present: From 836ed3c4e473fef9e0814a2ba6dd40f9656c03f1 Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Mon, 30 Sep 2024 17:10:50 +0100 Subject: [PATCH 064/168] arm64: lib: Use MOPS for memcpy() routines Make memcpy(), memmove() and memset() use the Armv8.8 FEAT_MOPS instructions when implemented on the CPU. The CPY*/SET* instructions copy or set a block of memory of arbitrary size and alignment. They can be interrupted by the CPU and the copying resumed later. Their performance is expected to be close to the best generic copy/set sequence of loads/stores for a given CPU. Using them in the kernel's copy/set routines therefore avoids the need to periodically rewrite the routines to optimize for new microarchitectures. It could also lead to a performance improvement for some CPUs and systems. With this change the kernel will always use the instructions if they are implemented on the CPU (and have not been disabled by the arm64.nomops command line parameter). When not implemented the usual routines will be used (patched via alternatives). Note, we need to patch B/NOP instead of the whole sequence to avoid executing a partially patched sequence in case the compiler generates a mem*() call inside the alternatives patching code. 
Note that MOPS instructions have relaxed behavior on Device memory, but it is expected that these routines are not generally used on MMIO. Note: For memcpy(), this uses the CPY* instructions instead of CPYF*, as CPY* allows overlaps between the source and destination buffers, and despite contradicting the C standard, compilers require that memcpy() work on exactly overlapping source and destination: https://gcc.gnu.org/onlinedocs/gcc/Standards.html#C-Language https://reviews.llvm.org/D86993 Signed-off-by: Kristina Martsenko Link: https://lore.kernel.org/r/20240930161051.3777828-5-kristina.martsenko@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 3 +++ arch/arm64/lib/memcpy.S | 19 ++++++++++++++++++- arch/arm64/lib/memset.S | 20 +++++++++++++++++++- 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3e29b44d2d7b..d0fe90ea704d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2155,6 +2155,9 @@ config ARM64_EPAN if the cpu does not implement the feature. endmenu # "ARMv8.7 architectural features" +config AS_HAS_MOPS + def_bool $(as-instr,.arch_extension mops) + menu "ARMv8.9 architectural features" config ARM64_POE diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S index 4ab48d49c451..9b99106fb95f 100644 --- a/arch/arm64/lib/memcpy.S +++ b/arch/arm64/lib/memcpy.S @@ -57,7 +57,7 @@ The loop tail is handled by always copying 64 bytes from the end. */ -SYM_FUNC_START(__pi_memcpy) +SYM_FUNC_START_LOCAL(__pi_memcpy_generic) add srcend, src, count add dstend, dstin, count cmp count, 128 @@ -238,7 +238,24 @@ L(copy64_from_start): stp B_l, B_h, [dstin, 16] stp C_l, C_h, [dstin] ret +SYM_FUNC_END(__pi_memcpy_generic) + +#ifdef CONFIG_AS_HAS_MOPS + .arch_extension mops +SYM_FUNC_START(__pi_memcpy) +alternative_if_not ARM64_HAS_MOPS + b __pi_memcpy_generic +alternative_else_nop_endif + + mov dst, dstin + cpyp [dst]!, [src]!, count! + cpym [dst]!, [src]!, count! + cpye [dst]!, [src]!, count! + ret SYM_FUNC_END(__pi_memcpy) +#else +SYM_FUNC_ALIAS(__pi_memcpy, __pi_memcpy_generic) +#endif SYM_FUNC_ALIAS(__memcpy, __pi_memcpy) EXPORT_SYMBOL(__memcpy) diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S index a5aebe82ad73..97157da65ec6 100644 --- a/arch/arm64/lib/memset.S +++ b/arch/arm64/lib/memset.S @@ -26,6 +26,7 @@ */ dstin .req x0 +val_x .req x1 val .req w1 count .req x2 tmp1 .req x3 @@ -42,7 +43,7 @@ dst .req x8 tmp3w .req w9 tmp3 .req x9 -SYM_FUNC_START(__pi_memset) +SYM_FUNC_START_LOCAL(__pi_memset_generic) mov dst, dstin /* Preserve return value. 
*/ and A_lw, val, #255 orr A_lw, A_lw, A_lw, lsl #8 @@ -201,7 +202,24 @@ SYM_FUNC_START(__pi_memset) ands count, count, zva_bits_x b.ne .Ltail_maybe_long ret +SYM_FUNC_END(__pi_memset_generic) + +#ifdef CONFIG_AS_HAS_MOPS + .arch_extension mops +SYM_FUNC_START(__pi_memset) +alternative_if_not ARM64_HAS_MOPS + b __pi_memset_generic +alternative_else_nop_endif + + mov dst, dstin + setp [dst]!, count!, val_x + setm [dst]!, count!, val_x + sete [dst]!, count!, val_x + ret SYM_FUNC_END(__pi_memset) +#else +SYM_FUNC_ALIAS(__pi_memset, __pi_memset_generic) +#endif SYM_FUNC_ALIAS(__memset, __pi_memset) EXPORT_SYMBOL(__memset) From ce6b5ff5f16dd9267d62d09b3af3f0c7dc3c24f0 Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Mon, 30 Sep 2024 17:10:51 +0100 Subject: [PATCH 065/168] arm64: lib: Use MOPS for copy_page() and clear_page() Similarly to what was done to the memcpy() routines, make copy_page() and clear_page() also use the Armv8.8 FEAT_MOPS instructions. Note: For copy_page() this uses the CPY* instructions instead of CPYF* as CPYF* doesn't allow src and dst to be equal. It's not clear if copy_page() needs to allow equal src and dst but it has worked so far with the current implementation and there is no documentation forbidding it. Note, the unoptimized version of copy_page() in assembler.h is left as it is. Signed-off-by: Kristina Martsenko Link: https://lore.kernel.org/r/20240930161051.3777828-6-kristina.martsenko@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/lib/clear_page.S | 13 +++++++++++++ arch/arm64/lib/copy_page.S | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/arch/arm64/lib/clear_page.S b/arch/arm64/lib/clear_page.S index ebde40e7fa2b..bd6f7d5eb6eb 100644 --- a/arch/arm64/lib/clear_page.S +++ b/arch/arm64/lib/clear_page.S @@ -15,6 +15,19 @@ * x0 - dest */ SYM_FUNC_START(__pi_clear_page) +#ifdef CONFIG_AS_HAS_MOPS + .arch_extension mops +alternative_if_not ARM64_HAS_MOPS + b .Lno_mops +alternative_else_nop_endif + + mov x1, #PAGE_SIZE + setpn [x0]!, x1!, xzr + setmn [x0]!, x1!, xzr + seten [x0]!, x1!, xzr + ret +.Lno_mops: +#endif mrs x1, dczid_el0 tbnz x1, #4, 2f /* Branch if DC ZVA is prohibited */ and w1, w1, #0xf diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S index 6a56d7cf309d..e6374e7e5511 100644 --- a/arch/arm64/lib/copy_page.S +++ b/arch/arm64/lib/copy_page.S @@ -18,6 +18,19 @@ * x1 - src */ SYM_FUNC_START(__pi_copy_page) +#ifdef CONFIG_AS_HAS_MOPS + .arch_extension mops +alternative_if_not ARM64_HAS_MOPS + b .Lno_mops +alternative_else_nop_endif + + mov x2, #PAGE_SIZE + cpypwn [x0]!, [x1]!, x2! + cpymwn [x0]!, [x1]!, x2! + cpyewn [x0]!, [x1]!, x2! + ret +.Lno_mops: +#endif ldp x2, x3, [x1] ldp x4, x5, [x1, #16] ldp x6, x7, [x1, #32] From c87df9cb9a216bc87951087a2592633cdc2737d4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:29 +0100 Subject: [PATCH 066/168] arm64: pt_regs: assert pt_regs is a multiple of 16 bytes To ensure that the stack is correctly aligned when branching to C code, we require that struct pt_regs is a multiple of 16 bytes, as noted in a comment. Add an explicit assertion for this, so that any accidental violation of this requirement will be caught by the compiler. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. 
Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-2-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/ptrace.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 0abe975d68a8..10eca3a3fed4 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -149,8 +149,7 @@ static inline unsigned long pstate_to_compat_psr(const unsigned long pstate) /* * This struct defines the way the registers are stored on the stack during an
- * exception. Note that sizeof(struct pt_regs) has to be a multiple of 16 (for
- * stack alignment). struct user_pt_regs must form a prefix of struct pt_regs.
+ * exception. struct user_pt_regs must form a prefix of struct pt_regs.
 */ struct pt_regs { union { @@ -180,6 +179,9 @@ struct pt_regs { u64 exit_rcu; };
+/* For correct stack alignment, pt_regs has to be a multiple of 16 bytes. */
+static_assert(IS_ALIGNED(sizeof(struct pt_regs), 16));
+
 static inline bool in_syscall(struct pt_regs const *regs) { return regs->syscallno != NO_SYSCALL;
From 2716d59bf48328ea1bc2a90d81488f715d4cdea4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:30 +0100 Subject: [PATCH 067/168] arm64: pt_regs: remove stale big-endian layout For historical reasons the layout of struct pt_regs depends on the configured endianness, with the order of the 'syscallno' and 'unused2' fields varying depending on whether __AARCH64EB__ is defined. We no longer depend on the order of these two fields and can remove the ifdeffery. The current conditional layout was introduced in commit: 35d0e6fb4d219d64 ("arm64: syscallno is secretly an int, make it official") At the time, this was necessary so that the entry assembly could use a single STP instruction to save the pt_regs::{orig_x0,syscallno} fields, without logic that was conditional on the endianness of the kernel: | el0_svc_naked: | stp x0, xscno, [sp, #S_ORIG_X0] // save the original x0 and syscall number This logic was converted to C in commit: f37099b6992a0b81 ("arm64: convert syscall trace logic to C") Since that commit, we no longer manipulate pt_regs::orig_x0 from assembly, and only manipulate pt_regs::syscallno as a 32-bit quantity early in the kernel_entry assembly: | /* Not in a syscall by default (el0_svc overwrites for real syscall) */ | .if \el == 0 | mov w21, #NO_SYSCALL | str w21, [sp, #S_SYSCALLNO] | .endif Given the above, there's no longer a need for the layout of pt_regs::{syscallno,unused2} to depend on the endianness of the kernel. This patch removes the ifdeffery and places 'syscallno' before 'unused2' regardless of the endianness of the kernel. At the same time, 'unused2' is renamed to 'unused', as it is the only unused field within pt_regs. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. 
Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/ptrace.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 10eca3a3fed4..1ae671fe64d8 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -162,13 +162,9 @@ struct pt_regs { }; }; u64 orig_x0; -#ifdef __AARCH64EB__ - u32 unused2; s32 syscallno; -#else - s32 syscallno; - u32 unused2; -#endif + u32 unused; + u64 sdei_ttbr1; /* Only valid when ARM64_HAS_GIC_PRIO_MASKING is enabled. */ u64 pmr_save; From 00d9597903d0053bb13c74dfe61bcba35e213bd7 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:31 +0100 Subject: [PATCH 068/168] arm64: pt_regs: rename "pmr_save" -> "pmr" The pt_regs::pmr_save field is weirdly named relative to all other pt_regs fields, with a '_save' suffix that doesn't make anything clearer and only leads to more typing to access the field. Remove the '_save' suffix. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-4-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/daifflags.h | 2 +- arch/arm64/include/asm/processor.h | 2 +- arch/arm64/include/asm/ptrace.h | 4 ++-- arch/arm64/kernel/asm-offsets.c | 2 +- arch/arm64/kernel/entry.S | 4 ++-- arch/arm64/kernel/process.c | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/daifflags.h b/arch/arm64/include/asm/daifflags.h index 55f57dfa8e2f..fbb5c99eb2f9 100644 --- a/arch/arm64/include/asm/daifflags.h +++ b/arch/arm64/include/asm/daifflags.h @@ -132,7 +132,7 @@ static inline void local_daif_inherit(struct pt_regs *regs) trace_hardirqs_on(); if (system_uses_irq_prio_masking()) - gic_write_pmr(regs->pmr_save); + gic_write_pmr(regs->pmr); /* * We can't use local_daif_restore(regs->pstate) here as diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 1438424f0064..03b99164f7f4 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -293,7 +293,7 @@ static inline void start_thread_common(struct pt_regs *regs, unsigned long pc) regs->pc = pc; if (system_uses_irq_prio_masking()) - regs->pmr_save = GIC_PRIO_IRQON; + regs->pmr = GIC_PRIO_IRQON; } static inline void start_thread(struct pt_regs *regs, unsigned long pc, diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 1ae671fe64d8..e82e6e47ac9a 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -167,7 +167,7 @@ struct pt_regs { u64 sdei_ttbr1; /* Only valid when ARM64_HAS_GIC_PRIO_MASKING is enabled. */ - u64 pmr_save; + u64 pmr; u64 stackframe[2]; /* Only valid for some EL1 exceptions. */ @@ -211,7 +211,7 @@ static inline void forget_syscall(struct pt_regs *regs) #define irqs_priority_unmasked(regs) \ (system_uses_irq_prio_masking() ? 
\ - (regs)->pmr_save == GIC_PRIO_IRQON : \ + (regs)->pmr == GIC_PRIO_IRQON : \ true) #define interrupts_enabled(regs) \ diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 27de1dddb0ab..eb7fb2f9b927 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -79,7 +79,7 @@ int main(void) DEFINE(S_PSTATE, offsetof(struct pt_regs, pstate)); DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno)); DEFINE(S_SDEI_TTBR1, offsetof(struct pt_regs, sdei_ttbr1)); - DEFINE(S_PMR_SAVE, offsetof(struct pt_regs, pmr_save)); + DEFINE(S_PMR, offsetof(struct pt_regs, pmr)); DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); DEFINE(PT_REGS_SIZE, sizeof(struct pt_regs)); BLANK(); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 7ef0e127b149..a84ce95ad96c 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -315,7 +315,7 @@ alternative_if_not ARM64_HAS_GIC_PRIO_MASKING alternative_else_nop_endif mrs_s x20, SYS_ICC_PMR_EL1 - str x20, [sp, #S_PMR_SAVE] + str x20, [sp, #S_PMR] mov x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET msr_s SYS_ICC_PMR_EL1, x20 @@ -342,7 +342,7 @@ alternative_if_not ARM64_HAS_GIC_PRIO_MASKING b .Lskip_pmr_restore\@ alternative_else_nop_endif - ldr x20, [sp, #S_PMR_SAVE] + ldr x20, [sp, #S_PMR] msr_s SYS_ICC_PMR_EL1, x20 /* Ensure priority change is seen by redistributor */ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 0540653fbf38..2951b5ba1937 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -227,7 +227,7 @@ void __show_regs(struct pt_regs *regs) printk("sp : %016llx\n", sp); if (system_uses_irq_prio_masking()) - printk("pmr_save: %08llx\n", regs->pmr_save); + printk("pmr: %08llx\n", regs->pmr); i = top_reg; From 1454363098a0f7f14479f6a45945bf8d3d775a95 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:32 +0100 Subject: [PATCH 069/168] arm64: pt_regs: swap 'unused' and 'pmr' fields In subsequent patches we'll want to add an additional u64 to struct pt_regs. To make space, this patch swaps the 'unused' and 'pmr' fields, as the 'pmr' value only requires bits[7:0] and can safely fit into a u32, which frees up a 64-bit unused field. The 'lockdep_hardirqs' and 'exit_rcu' fields should eventually be moved out of pt_regs and managed locally within entry-common.c, so I've left those as-is for the moment. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-5-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/ptrace.h | 6 +++--- arch/arm64/kernel/entry.S | 4 ++-- arch/arm64/kernel/process.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index e82e6e47ac9a..92531aeba531 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -163,11 +163,11 @@ struct pt_regs { }; u64 orig_x0; s32 syscallno; - u32 unused; + u32 pmr; u64 sdei_ttbr1; - /* Only valid when ARM64_HAS_GIC_PRIO_MASKING is enabled. */ - u64 pmr; + u64 unused; + u64 stackframe[2]; /* Only valid for some EL1 exceptions. 
*/ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index a84ce95ad96c..fa6d6d5ca5e0 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -315,7 +315,7 @@ alternative_if_not ARM64_HAS_GIC_PRIO_MASKING alternative_else_nop_endif mrs_s x20, SYS_ICC_PMR_EL1 - str x20, [sp, #S_PMR] + str w20, [sp, #S_PMR] mov x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET msr_s SYS_ICC_PMR_EL1, x20 @@ -342,7 +342,7 @@ alternative_if_not ARM64_HAS_GIC_PRIO_MASKING b .Lskip_pmr_restore\@ alternative_else_nop_endif - ldr x20, [sp, #S_PMR] + ldr w20, [sp, #S_PMR] msr_s SYS_ICC_PMR_EL1, x20 /* Ensure priority change is seen by redistributor */ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 2951b5ba1937..c722c1be6fa5 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -227,7 +227,7 @@ void __show_regs(struct pt_regs *regs) printk("sp : %016llx\n", sp); if (system_uses_irq_prio_masking()) - printk("pmr: %08llx\n", regs->pmr); + printk("pmr: %08x\n", regs->pmr); i = top_reg; From 886c2b0ba820b9d6ffe3a7c670eb2f519755123c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:33 +0100 Subject: [PATCH 070/168] arm64: use a common struct frame_record Currently the signal handling code has its own struct frame_record, the definition of struct pt_regs open-codes a frame record as an array, and the kernel unwinder hard-codes frame record offsets. Move to a common struct frame_record that can be used throughout the kernel. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-6-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/ptrace.h | 4 +++- arch/arm64/include/asm/stacktrace/common.h | 8 +++++--- arch/arm64/include/asm/stacktrace/frame.h | 13 +++++++++++++ arch/arm64/kernel/process.c | 2 +- arch/arm64/kernel/signal.c | 5 ----- arch/arm64/kernel/stacktrace.c | 2 +- 6 files changed, 23 insertions(+), 11 deletions(-) create mode 100644 arch/arm64/include/asm/stacktrace/frame.h diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 92531aeba531..89c02f85f4b1 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -98,6 +98,8 @@ #include #include +#include + /* sizeof(struct user) for AArch32 */ #define COMPAT_USER_SZ 296 @@ -168,7 +170,7 @@ struct pt_regs { u64 sdei_ttbr1; u64 unused; - u64 stackframe[2]; + struct frame_record stackframe; /* Only valid for some EL1 exceptions. */ u64 lockdep_hardirqs; diff --git a/arch/arm64/include/asm/stacktrace/common.h b/arch/arm64/include/asm/stacktrace/common.h index f63dc654e545..7fab6876e497 100644 --- a/arch/arm64/include/asm/stacktrace/common.h +++ b/arch/arm64/include/asm/stacktrace/common.h @@ -137,21 +137,23 @@ found: static inline int unwind_next_frame_record(struct unwind_state *state) { + struct frame_record *record; unsigned long fp = state->fp; int err; if (fp & 0x7) return -EINVAL; - err = unwind_consume_stack(state, fp, 16); + err = unwind_consume_stack(state, fp, sizeof(*record)); if (err) return err; /* * Record this frame record's values. 
*/ - state->fp = READ_ONCE(*(unsigned long *)(fp)); - state->pc = READ_ONCE(*(unsigned long *)(fp + 8)); + record = (struct frame_record *)fp; + state->fp = READ_ONCE(record->fp); + state->pc = READ_ONCE(record->lr); return 0; } diff --git a/arch/arm64/include/asm/stacktrace/frame.h b/arch/arm64/include/asm/stacktrace/frame.h new file mode 100644 index 000000000000..6397bc847f14 --- /dev/null +++ b/arch/arm64/include/asm/stacktrace/frame.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_STACKTRACE_FRAME_H +#define __ASM_STACKTRACE_FRAME_H + +/* + * A standard AAPCS64 frame record. + */ +struct frame_record { + u64 fp; + u64 lr; +}; + +#endif /* __ASM_STACKTRACE_FRAME_H */ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index c722c1be6fa5..d45fd114eac3 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -419,7 +419,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) * For the benefit of the unwinder, set up childregs->stackframe * as the final frame for the new task. */ - p->thread.cpu_context.fp = (unsigned long)childregs->stackframe; + p->thread.cpu_context.fp = (unsigned long)&childregs->stackframe; ptrace_hw_copy_thread(p); diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 561986947530..2c47f9a0e40b 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -42,11 +42,6 @@ struct rt_sigframe { struct ucontext uc; }; -struct frame_record { - u64 fp; - u64 lr; -}; - struct rt_sigframe_user_layout { struct rt_sigframe __user *sigframe; struct frame_record __user *next_frame; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 2729faaee4b4..ffe8e4f54956 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -145,7 +145,7 @@ kunwind_next(struct kunwind_state *state) int err; /* Final frame; nothing to unwind */ - if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) + if (fp == (unsigned long)&task_pt_regs(tsk)->stackframe) return -ENOENT; err = unwind_next_frame_record(&state->common); From b7794795c93d9c15e4bc64db2f3bf2104451e3bc Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:34 +0100 Subject: [PATCH 071/168] arm64: stacktrace: move dump_backtrace() to kunwind_stack_walk() Currently dump_backtrace() can only print the PC value at each step of the unwind, as this is all the information that arch_stack_walk() passes to the dump_backtrace_entry() callback. In future we'd like to print some additional information, such as the origin of entries (e.g. PC, LR, FP) and/or the reliability thereof. In preparation for doing so, this patch moves dump_backtrace() over to kunwind_stack_walk(), which passes the full kunwind_state to the callback. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. 
Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-7-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/stacktrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index ffe8e4f54956..22f8709f0e76 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -294,10 +294,10 @@ noinline noinstr void arch_bpf_stack_walk(bool (*consume_entry)(void *cookie, u6 kunwind_stack_walk(arch_bpf_unwind_consume_entry, &data, current, NULL); } -static bool dump_backtrace_entry(void *arg, unsigned long where) +static bool dump_backtrace_entry(const struct kunwind_state *state, void *arg) { char *loglvl = arg; - printk("%s %pSb\n", loglvl, (void *)where); + printk("%s %pSb\n", loglvl, (void *)state->common.pc); return true; } @@ -316,7 +316,7 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, return; printk("%sCall trace:\n", loglvl); - arch_stack_walk(dump_backtrace_entry, (void *)loglvl, tsk, regs); + kunwind_stack_walk(dump_backtrace_entry, (void *)loglvl, tsk, regs); put_task_stack(tsk); } From bdf8eafbf7f56f9fa43a019cdc1a5f057210f01d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:35 +0100 Subject: [PATCH 072/168] arm64: stacktrace: report source of unwind data When analysing a stacktrace it can be useful to know where an unwound PC came from, as in some situations certain sources may be suspect or known to be unreliable. In future it would also be useful to track this so that certain unwind steps can be performed in a stateful manner. For example when unwinding across an exception boundary, we'd ideally unwind pt_regs::pc, then pt_regs::lr, then the next frame record. This patch adds an enumerated set of unwind sources, tracks this during the unwind, and updates dump_backtrace() to log these for interesting unwind steps. The interesting sources recorded are: "C" - the PC came from the caller of an unwind function. "T" - the PC came from thread_saved_pc() for a blocked task. "P" - the PC came from a pt_regs::pc. "U" - the PC came from an unknown source (indicates an unwinder error). ... with nothing recorded when the PC came from a frame_record::pc as this is the vastly common case and logging this would make it difficult to spot the more interesting cases. For example, when triggering a backtrace via magic-sysrq + L, the CPU handling the sysrq will have a backtrace whose first element is the caller (C) of dump_backtrace(): | Call trace: | show_stack+0x18/0x30 (C) | dump_stack_lvl+0x60/0x80 | dump_stack+0x18/0x24 | nmi_cpu_backtrace+0xfc/0x140 | ... ... and other CPUs will have a backtrace whose first element is their pt_regs::pc (P) at the instant the backtrace IPI was taken: | Call trace: | _raw_spin_unlock_irqrestore+0x8/0x50 (P) | wake_up_process+0x18/0x24 | process_timeout+0x14/0x20 | call_timer_fn.isra.0+0x24/0x80 | ... Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. 
Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-8-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/stacktrace.c | 47 +++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 22f8709f0e76..83ab37e13159 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -20,6 +20,14 @@ #include #include +enum kunwind_source { + KUNWIND_SOURCE_UNKNOWN, + KUNWIND_SOURCE_FRAME, + KUNWIND_SOURCE_CALLER, + KUNWIND_SOURCE_TASK, + KUNWIND_SOURCE_REGS_PC, +}; + /* * Kernel unwind state * @@ -37,6 +45,7 @@ struct kunwind_state { #ifdef CONFIG_KRETPROBES struct llist_node *kr_cur; #endif + enum kunwind_source source; }; static __always_inline void @@ -45,6 +54,7 @@ kunwind_init(struct kunwind_state *state, { unwind_init_common(&state->common); state->task = task; + state->source = KUNWIND_SOURCE_UNKNOWN; } /* @@ -62,6 +72,7 @@ kunwind_init_from_regs(struct kunwind_state *state, state->common.fp = regs->regs[29]; state->common.pc = regs->pc; + state->source = KUNWIND_SOURCE_REGS_PC; } /* @@ -79,6 +90,7 @@ kunwind_init_from_caller(struct kunwind_state *state) state->common.fp = (unsigned long)__builtin_frame_address(1); state->common.pc = (unsigned long)__builtin_return_address(0); + state->source = KUNWIND_SOURCE_CALLER; } /* @@ -99,6 +111,7 @@ kunwind_init_from_task(struct kunwind_state *state, state->common.fp = thread_saved_fp(task); state->common.pc = thread_saved_pc(task); + state->source = KUNWIND_SOURCE_TASK; } static __always_inline int @@ -148,9 +161,19 @@ kunwind_next(struct kunwind_state *state) if (fp == (unsigned long)&task_pt_regs(tsk)->stackframe) return -ENOENT; - err = unwind_next_frame_record(&state->common); - if (err) - return err; + switch (state->source) { + case KUNWIND_SOURCE_FRAME: + case KUNWIND_SOURCE_CALLER: + case KUNWIND_SOURCE_TASK: + case KUNWIND_SOURCE_REGS_PC: + err = unwind_next_frame_record(&state->common); + if (err) + return err; + state->source = KUNWIND_SOURCE_FRAME; + break; + default: + return -EINVAL; + } state->common.pc = ptrauth_strip_kernel_insn_pac(state->common.pc); @@ -294,10 +317,26 @@ noinline noinstr void arch_bpf_stack_walk(bool (*consume_entry)(void *cookie, u6 kunwind_stack_walk(arch_bpf_unwind_consume_entry, &data, current, NULL); } +static const char *state_source_string(const struct kunwind_state *state) +{ + switch (state->source) { + case KUNWIND_SOURCE_FRAME: return NULL; + case KUNWIND_SOURCE_CALLER: return "C"; + case KUNWIND_SOURCE_TASK: return "T"; + case KUNWIND_SOURCE_REGS_PC: return "P"; + default: return "U"; + } +} + static bool dump_backtrace_entry(const struct kunwind_state *state, void *arg) { + const char *source = state_source_string(state); char *loglvl = arg; - printk("%s %pSb\n", loglvl, (void *)state->common.pc); + printk("%s %pSb%s%s%s\n", loglvl, + (void *)state->common.pc, + source ? " (" : "", + source ? source : "", + source ? ")" : ""); return true; } From 8094df1cf09248e60afd0e14fb6c2ba4c79b0b9c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:36 +0100 Subject: [PATCH 073/168] arm64: stacktrace: report recovered PCs When analysing a stacktrace it can be useful to know whether an unwound PC has been rewritten by fgraph or kretprobes, as in some situations these may be suspect or be known to be unreliable. 
This patch adds flags to track when an unwind entry has recovered the PC from fgraph and/or kretprobes, and updates dump_backtrace() to log when this is the case. The flags recorded are: "F" - the PC was recovered from fgraph "K" - the PC was recovered from kretprobes These flags are recorded and logged in addition to the original source of the unwound PC. For example, with the ftrace_graph profiler enabled globally, and kretprobes installed on generic_handle_domain_irq() and do_interrupt_handler(), a backtrace triggered by magic-sysrq + L reports: | Call trace: | show_stack+0x20/0x40 (CF) | dump_stack_lvl+0x60/0x80 (F) | dump_stack+0x18/0x28 | nmi_cpu_backtrace+0xfc/0x140 | nmi_trigger_cpumask_backtrace+0x1c8/0x200 | arch_trigger_cpumask_backtrace+0x20/0x40 | sysrq_handle_showallcpus+0x24/0x38 (F) | __handle_sysrq+0xa8/0x1b0 (F) | handle_sysrq+0x38/0x50 (F) | pl011_int+0x460/0x5a8 (F) | __handle_irq_event_percpu+0x60/0x220 (F) | handle_irq_event+0x54/0xc0 (F) | handle_fasteoi_irq+0xa8/0x1d0 (F) | generic_handle_domain_irq+0x34/0x58 (F) | gic_handle_irq+0x54/0x140 (FK) | call_on_irq_stack+0x24/0x58 (F) | do_interrupt_handler+0x88/0xa0 | el1_interrupt+0x34/0x68 (FK) | el1h_64_irq_handler+0x18/0x28 | el1h_64_irq+0x64/0x68 | default_idle_call+0x34/0x180 | do_idle+0x204/0x268 | cpu_startup_entry+0x40/0x50 (F) | rest_init+0xe4/0xf0 | start_kernel+0x744/0x750 | __primary_switched+0x80/0x90 Note that as these flags are reported next to the recovered PC value, they appear on the callers of instrumented functions. For example gic_handle_irq() has a "K" marker because generic_handle_domain_irq() was instrumented with kretprobes and had its return address rewritten. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. 
Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-9-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/stacktrace.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 83ab37e13159..f8e231683dad 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -28,6 +28,14 @@ enum kunwind_source { KUNWIND_SOURCE_REGS_PC, }; +union unwind_flags { + unsigned long all; + struct { + unsigned long fgraph : 1, + kretprobe : 1; + }; +}; + /* * Kernel unwind state * @@ -46,6 +54,7 @@ struct kunwind_state { struct llist_node *kr_cur; #endif enum kunwind_source source; + union unwind_flags flags; }; static __always_inline void @@ -55,6 +64,7 @@ kunwind_init(struct kunwind_state *state, unwind_init_common(&state->common); state->task = task; state->source = KUNWIND_SOURCE_UNKNOWN; + state->flags.all = 0; } /* @@ -127,6 +137,7 @@ kunwind_recover_return_address(struct kunwind_state *state) if (WARN_ON_ONCE(state->common.pc == orig_pc)) return -EINVAL; state->common.pc = orig_pc; + state->flags.fgraph = 1; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -137,6 +148,7 @@ kunwind_recover_return_address(struct kunwind_state *state) (void *)state->common.fp, &state->kr_cur); state->common.pc = orig_pc; + state->flags.kretprobe = 1; } #endif /* CONFIG_KRETPROBES */ @@ -157,6 +169,8 @@ kunwind_next(struct kunwind_state *state) unsigned long fp = state->common.fp; int err; + state->flags.all = 0; + /* Final frame; nothing to unwind */ if (fp == (unsigned long)&task_pt_regs(tsk)->stackframe) return -ENOENT; @@ -331,12 +345,18 @@ static const char *state_source_string(const struct kunwind_state *state) static bool dump_backtrace_entry(const struct kunwind_state *state, void *arg) { const char *source = state_source_string(state); + union unwind_flags flags = state->flags; + bool has_info = source || flags.all; char *loglvl = arg; - printk("%s %pSb%s%s%s\n", loglvl, + + printk("%s %pSb%s%s%s%s%s\n", loglvl, (void *)state->common.pc, - source ? " (" : "", + has_info ? " (" : "", source ? source : "", - source ? ")" : ""); + flags.fgraph ? "F" : "", + flags.kretprobe ? "K" : "", + has_info ? ")" : ""); + return true; } From f05a4a42de9031a819334a90b353ac48fd94f3a4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:37 +0100 Subject: [PATCH 074/168] arm64: stacktrace: split unwind_consume_stack() When unwinding stacks, we use unwind_consume_stack() to both find whether an object (e.g. a frame record) is on an accessible stack *and* to update the stack boundaries. This works fine today since we only care about one type of object which does not overlap other objects. In subsequent patches we'll want to check whether an object (e.g a frame record) is on the stack and follow this up by accessing a larger object containing the first (e.g. a pt_regs with an embedded frame record). To make that pattern easier to implement, this patch reworks unwind_find_next_stack() and unwind_consume_stack() so that the former can be used to check if an object is on any accessible stack, and the latter is purely used to update the stack boundaries. As unwind_find_next_stack() is modified to also check the stack currently being unwound, it is renamed to unwind_find_stack(). There should be no functional change as a result of this patch. 
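As an illustration of the pattern this split is intended to enable (a rough sketch only, not code added by this patch; 'obj_base' and 'obj_size' are placeholders for whichever containing object a later patch cares about):

| /* Check that the frame record is on an accessible stack at all */
| info = unwind_find_stack(state, fp, sizeof(struct frame_record));
| if (!info)
|	return -EINVAL;
|
| /*
|  * ... then consume the larger object which embeds it (e.g. a pt_regs),
|  * so that it cannot be unwound through again.
|  */
| unwind_consume_stack(state, info, obj_base, obj_size);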
Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-10-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/stacktrace/common.h | 68 +++++++++++++--------- 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/arch/arm64/include/asm/stacktrace/common.h b/arch/arm64/include/asm/stacktrace/common.h index 7fab6876e497..821a8fdd31af 100644 --- a/arch/arm64/include/asm/stacktrace/common.h +++ b/arch/arm64/include/asm/stacktrace/common.h @@ -60,13 +60,27 @@ static inline void unwind_init_common(struct unwind_state *state) state->stack = stackinfo_get_unknown(); } -static struct stack_info *unwind_find_next_stack(const struct unwind_state *state, - unsigned long sp, - unsigned long size) +/** + * unwind_find_stack() - Find the accessible stack which entirely contains an + * object. + * + * @state: the current unwind state. + * @sp: the base address of the object. + * @size: the size of the object. + * + * Return: a pointer to the relevant stack_info if found; NULL otherwise. + */ +static struct stack_info *unwind_find_stack(struct unwind_state *state, + unsigned long sp, + unsigned long size) { - for (int i = 0; i < state->nr_stacks; i++) { - struct stack_info *info = &state->stacks[i]; + struct stack_info *info = &state->stack; + if (stackinfo_on_stack(info, sp, size)) + return info; + + for (int i = 0; i < state->nr_stacks; i++) { + info = &state->stacks[i]; if (stackinfo_on_stack(info, sp, size)) return info; } @@ -75,36 +89,31 @@ static struct stack_info *unwind_find_next_stack(const struct unwind_state *stat } /** - * unwind_consume_stack() - Check if an object is on an accessible stack, - * updating stack boundaries so that future unwind steps cannot consume this - * object again. + * unwind_consume_stack() - Update stack boundaries so that future unwind steps + * cannot consume this object again. * * @state: the current unwind state. + * @info: the stack_info of the stack containing the object. * @sp: the base address of the object. * @size: the size of the object. * * Return: 0 upon success, an error code otherwise. */ -static inline int unwind_consume_stack(struct unwind_state *state, - unsigned long sp, - unsigned long size) +static inline void unwind_consume_stack(struct unwind_state *state, + struct stack_info *info, + unsigned long sp, + unsigned long size) { - struct stack_info *next; - - if (stackinfo_on_stack(&state->stack, sp, size)) - goto found; - - next = unwind_find_next_stack(state, sp, size); - if (!next) - return -EINVAL; + struct stack_info tmp; /* * Stack transitions are strictly one-way, and once we've * transitioned from one stack to another, it's never valid to * unwind back to the old stack. * - * Remove the current stack from the list of stacks so that it cannot - * be found on a subsequent transition. + * Destroy the old stack info so that it cannot be found upon a + * subsequent transition. If the stack has not changed, we'll + * immediately restore the current stack info. * * Note that stacks can nest in several valid orders, e.g. * @@ -115,16 +124,15 @@ static inline int unwind_consume_stack(struct unwind_state *state, * ... so we do not check the specific order of stack * transitions. 
*/ - state->stack = *next; - *next = stackinfo_get_unknown(); + tmp = *info; + *info = stackinfo_get_unknown(); + state->stack = tmp; -found: /* * Future unwind steps can only consume stack above this frame record. * Update the current stack to start immediately above it. */ state->stack.low = sp + size; - return 0; } /** @@ -137,16 +145,18 @@ found: static inline int unwind_next_frame_record(struct unwind_state *state) { + struct stack_info *info; struct frame_record *record; unsigned long fp = state->fp; - int err; if (fp & 0x7) return -EINVAL; - err = unwind_consume_stack(state, fp, sizeof(*record)); - if (err) - return err; + info = unwind_find_stack(state, fp, sizeof(*record)); + if (!info) + return -EINVAL; + + unwind_consume_stack(state, info, fp, sizeof(*record)); /* * Record this frame record's values. From c2c6b27b5aa14fa28e3f455f697ccd2e0e75d773 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:38 +0100 Subject: [PATCH 075/168] arm64: stacktrace: unwind exception boundaries When arm64's stack unwinder encounters an exception boundary, it uses the pt_regs::stackframe created by the entry code, which has a copy of the PC and FP at the time the exception was taken. The unwinder doesn't know anything about pt_regs, and reports the PC from the stackframe, but does not report the LR. The LR is only guaranteed to contain the return address at function call boundaries, and can be used as a scratch register at other times, so the LR at an exception boundary may or may not be a legitimate return address. It would be useful to report the LR value regardless, as it can be helpful when debugging, and in future it will be helpful for reliable stacktrace support. This patch changes the way we unwind across exception boundaries, allowing both the PC and LR to be reported. The entry code creates a frame_record_meta structure embedded within pt_regs, which the unwinder uses to find the pt_regs. The unwinder can then extract pt_regs::pc and pt_regs::lr as two separate unwind steps before continuing with a regular walk of frame records. When a PC is unwound from pt_regs::lr, dump_backtrace() will log this with an "L" marker so that it can be identified easily. For example, an unwind across an exception boundary will appear as follows: | el1h_64_irq+0x6c/0x70 | _raw_spin_unlock_irqrestore+0x10/0x60 (P) | __aarch64_insn_write+0x6c/0x90 (L) | aarch64_insn_patch_text_nosync+0x28/0x80 ... with a (P) entry for pt_regs::pc, and an (L) entry for pt_regs:lr. Note that the LR may be stale at the point of the exception, for example, shortly after a return: | el1h_64_irq+0x6c/0x70 | default_idle_call+0x34/0x180 (P) | default_idle_call+0x28/0x180 (L) | do_idle+0x204/0x268 ... where the LR points a few instructions before the current PC. This plays nicely with all the other unwind metadata tracking. 
With the ftrace_graph profiler enabled globally, and kretprobes installed on generic_handle_domain_irq() and do_interrupt_handler(), a backtrace triggered by magic-sysrq + L reports: | Call trace: | show_stack+0x20/0x40 (CF) | dump_stack_lvl+0x60/0x80 (F) | dump_stack+0x18/0x28 | nmi_cpu_backtrace+0xfc/0x140 | nmi_trigger_cpumask_backtrace+0x1c8/0x200 | arch_trigger_cpumask_backtrace+0x20/0x40 | sysrq_handle_showallcpus+0x24/0x38 (F) | __handle_sysrq+0xa8/0x1b0 (F) | handle_sysrq+0x38/0x50 (F) | pl011_int+0x460/0x5a8 (F) | __handle_irq_event_percpu+0x60/0x220 (F) | handle_irq_event+0x54/0xc0 (F) | handle_fasteoi_irq+0xa8/0x1d0 (F) | generic_handle_domain_irq+0x34/0x58 (F) | gic_handle_irq+0x54/0x140 (FK) | call_on_irq_stack+0x24/0x58 (F) | do_interrupt_handler+0x88/0xa0 | el1_interrupt+0x34/0x68 (FK) | el1h_64_irq_handler+0x18/0x28 | el1h_64_irq+0x6c/0x70 | default_idle_call+0x34/0x180 (P) | default_idle_call+0x28/0x180 (L) | do_idle+0x204/0x268 | cpu_startup_entry+0x3c/0x50 (F) | rest_init+0xe4/0xf0 | start_kernel+0x744/0x750 | __primary_switched+0x88/0x98 Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-11-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/ptrace.h | 4 +- arch/arm64/include/asm/stacktrace/frame.h | 35 +++++++ arch/arm64/kernel/asm-offsets.c | 1 + arch/arm64/kernel/entry.S | 12 ++- arch/arm64/kernel/head.S | 3 + arch/arm64/kernel/process.c | 1 + arch/arm64/kernel/stacktrace.c | 121 ++++++++++++++++++++-- 7 files changed, 158 insertions(+), 19 deletions(-) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 89c02f85f4b1..47ff8654c5ec 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -168,9 +168,7 @@ struct pt_regs { u32 pmr; u64 sdei_ttbr1; - u64 unused; - - struct frame_record stackframe; + struct frame_record_meta stackframe; /* Only valid for some EL1 exceptions. */ u64 lockdep_hardirqs; diff --git a/arch/arm64/include/asm/stacktrace/frame.h b/arch/arm64/include/asm/stacktrace/frame.h index 6397bc847f14..0ee0f6ba0fd8 100644 --- a/arch/arm64/include/asm/stacktrace/frame.h +++ b/arch/arm64/include/asm/stacktrace/frame.h @@ -3,6 +3,30 @@ #define __ASM_STACKTRACE_FRAME_H /* + * - FRAME_META_TYPE_NONE + * + * This value is reserved. + * + * - FRAME_META_TYPE_FINAL + * + * The record is the last entry on the stack. + * Unwinding should terminate successfully. + * + * - FRAME_META_TYPE_PT_REGS + * + * The record is embedded within a struct pt_regs, recording the registers at + * an arbitrary point in time. + * Unwinding should consume pt_regs::pc, followed by pt_regs::lr. + * + * Note: all other values are reserved and should result in unwinding + * terminating with an error. + */ +#define FRAME_META_TYPE_NONE 0 +#define FRAME_META_TYPE_FINAL 1 +#define FRAME_META_TYPE_PT_REGS 2 + +#ifndef __ASSEMBLY__ +/* * A standard AAPCS64 frame record. */ struct frame_record { @@ -10,4 +34,15 @@ struct frame_record { u64 lr; }; +/* + * A metadata frame record indicating a special unwind. + * The record::{fp,lr} fields must be zero to indicate the presence of + * metadata. 
+ */ +struct frame_record_meta { + struct frame_record record; + u64 type; +}; +#endif /* __ASSEMBLY */ + #endif /* __ASM_STACKTRACE_FRAME_H */ diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index eb7fb2f9b927..021f04f97fde 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -81,6 +81,7 @@ int main(void) DEFINE(S_SDEI_TTBR1, offsetof(struct pt_regs, sdei_ttbr1)); DEFINE(S_PMR, offsetof(struct pt_regs, pmr)); DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); + DEFINE(S_STACKFRAME_TYPE, offsetof(struct pt_regs, stackframe.type)); DEFINE(PT_REGS_SIZE, sizeof(struct pt_regs)); BLANK(); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index fa6d6d5ca5e0..5ae2a34b50bd 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -284,15 +285,16 @@ alternative_else_nop_endif stp lr, x21, [sp, #S_LR] /* - * For exceptions from EL0, create a final frame record. - * For exceptions from EL1, create a synthetic frame record so the - * interrupted code shows up in the backtrace. + * Create a metadata frame record. The unwinder will use this to + * identify and unwind exception boundaries. */ - .if \el == 0 stp xzr, xzr, [sp, #S_STACKFRAME] + .if \el == 0 + mov x0, #FRAME_META_TYPE_FINAL .else - stp x29, x22, [sp, #S_STACKFRAME] + mov x0, #FRAME_META_TYPE_PT_REGS .endif + str x0, [sp, #S_STACKFRAME_TYPE] add x29, sp, #S_STACKFRAME #ifdef CONFIG_ARM64_SW_TTBR0_PAN diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index cb68adcabe07..5ab1970ee543 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -199,6 +200,8 @@ SYM_CODE_END(preserve_boot_args) sub sp, sp, #PT_REGS_SIZE stp xzr, xzr, [sp, #S_STACKFRAME] + mov \tmp1, #FRAME_META_TYPE_FINAL + str \tmp1, [sp, #S_STACKFRAME_TYPE] add x29, sp, #S_STACKFRAME scs_load_current diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index d45fd114eac3..29904c829de2 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -409,6 +409,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) */ memset(childregs, 0, sizeof(struct pt_regs)); childregs->pstate = PSR_MODE_EL1h | PSR_IL_BIT; + childregs->stackframe.type = FRAME_META_TYPE_FINAL; p->thread.cpu_context.x19 = (unsigned long)args->fn; p->thread.cpu_context.x20 = (unsigned long)args->fn_arg; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index f8e231683dad..caef85462acb 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -26,6 +26,7 @@ enum kunwind_source { KUNWIND_SOURCE_CALLER, KUNWIND_SOURCE_TASK, KUNWIND_SOURCE_REGS_PC, + KUNWIND_SOURCE_REGS_LR, }; union unwind_flags { @@ -55,6 +56,7 @@ struct kunwind_state { #endif enum kunwind_source source; union unwind_flags flags; + struct pt_regs *regs; }; static __always_inline void @@ -65,6 +67,7 @@ kunwind_init(struct kunwind_state *state, state->task = task; state->source = KUNWIND_SOURCE_UNKNOWN; state->flags.all = 0; + state->regs = NULL; } /* @@ -80,6 +83,7 @@ kunwind_init_from_regs(struct kunwind_state *state, { kunwind_init(state, current); + state->regs = regs; state->common.fp = regs->regs[29]; state->common.pc = regs->pc; state->source = KUNWIND_SOURCE_REGS_PC; @@ -155,6 +159,103 @@ 
kunwind_recover_return_address(struct kunwind_state *state) return 0; } +static __always_inline +int kunwind_next_regs_pc(struct kunwind_state *state) +{ + struct stack_info *info; + unsigned long fp = state->common.fp; + struct pt_regs *regs; + + regs = container_of((u64 *)fp, struct pt_regs, stackframe.record.fp); + + info = unwind_find_stack(&state->common, (unsigned long)regs, sizeof(*regs)); + if (!info) + return -EINVAL; + + unwind_consume_stack(&state->common, info, (unsigned long)regs, + sizeof(*regs)); + + state->regs = regs; + state->common.pc = regs->pc; + state->common.fp = regs->regs[29]; + state->source = KUNWIND_SOURCE_REGS_PC; + return 0; +} + +static __always_inline int +kunwind_next_regs_lr(struct kunwind_state *state) +{ + /* + * The stack for the regs was consumed by kunwind_next_regs_pc(), so we + * cannot consume that again here, but we know the regs are safe to + * access. + */ + state->common.pc = state->regs->regs[30]; + state->common.fp = state->regs->regs[29]; + state->regs = NULL; + state->source = KUNWIND_SOURCE_REGS_LR; + + return 0; +} + +static __always_inline int +kunwind_next_frame_record_meta(struct kunwind_state *state) +{ + struct task_struct *tsk = state->task; + unsigned long fp = state->common.fp; + struct frame_record_meta *meta; + struct stack_info *info; + + info = unwind_find_stack(&state->common, fp, sizeof(*meta)); + if (!info) + return -EINVAL; + + meta = (struct frame_record_meta *)fp; + switch (READ_ONCE(meta->type)) { + case FRAME_META_TYPE_FINAL: + if (meta == &task_pt_regs(tsk)->stackframe) + return -ENOENT; + WARN_ON_ONCE(1); + return -EINVAL; + case FRAME_META_TYPE_PT_REGS: + return kunwind_next_regs_pc(state); + default: + WARN_ON_ONCE(1); + return -EINVAL; + } +} + +static __always_inline int +kunwind_next_frame_record(struct kunwind_state *state) +{ + unsigned long fp = state->common.fp; + struct frame_record *record; + struct stack_info *info; + unsigned long new_fp, new_pc; + + if (fp & 0x7) + return -EINVAL; + + info = unwind_find_stack(&state->common, fp, sizeof(*record)); + if (!info) + return -EINVAL; + + record = (struct frame_record *)fp; + new_fp = READ_ONCE(record->fp); + new_pc = READ_ONCE(record->lr); + + if (!new_fp && !new_pc) + return kunwind_next_frame_record_meta(state); + + unwind_consume_stack(&state->common, info, fp, sizeof(*record)); + + state->common.fp = new_fp; + state->common.pc = new_pc; + state->source = KUNWIND_SOURCE_FRAME; + + return 0; +} + /* * Unwind from one frame record (A) to the next frame record (B). 
* @@ -165,30 +266,27 @@ kunwind_recover_return_address(struct kunwind_state *state) static __always_inline int kunwind_next(struct kunwind_state *state) { - struct task_struct *tsk = state->task; - unsigned long fp = state->common.fp; int err; state->flags.all = 0; - /* Final frame; nothing to unwind */ - if (fp == (unsigned long)&task_pt_regs(tsk)->stackframe) - return -ENOENT; - switch (state->source) { case KUNWIND_SOURCE_FRAME: case KUNWIND_SOURCE_CALLER: case KUNWIND_SOURCE_TASK: + case KUNWIND_SOURCE_REGS_LR: + err = kunwind_next_frame_record(state); + break; case KUNWIND_SOURCE_REGS_PC: - err = unwind_next_frame_record(&state->common); - if (err) - return err; - state->source = KUNWIND_SOURCE_FRAME; + err = kunwind_next_regs_lr(state); break; default: - return -EINVAL; + err = -EINVAL; } + if (err) + return err; + state->common.pc = ptrauth_strip_kernel_insn_pac(state->common.pc); return kunwind_recover_return_address(state); @@ -338,6 +436,7 @@ static const char *state_source_string(const struct kunwind_state *state) case KUNWIND_SOURCE_CALLER: return "C"; case KUNWIND_SOURCE_TASK: return "T"; case KUNWIND_SOURCE_REGS_PC: return "P"; + case KUNWIND_SOURCE_REGS_LR: return "L"; default: return "U"; } } From 4e6e8c2b757f382684abc4765202cd25c221dea1 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 4 Oct 2024 21:26:29 +0100 Subject: [PATCH 076/168] binfmt_elf: Wire up AT_HWCAP3 at AT_HWCAP4 AT_HWCAP3 and AT_HWCAP4 were recently defined for use on PowerPC in commit 3281366a8e79 ("uapi/auxvec: Define AT_HWCAP3 and AT_HWCAP4 aux vector, entries"). Since we want to start using AT_HWCAP3 on arm64 add support for exposing both these new hwcaps via binfmt_elf. Signed-off-by: Mark Brown Acked-by: Kees Cook Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20241004-arm64-elf-hwcap3-v2-1-799d1daad8b0@kernel.org Signed-off-by: Catalin Marinas --- fs/binfmt_elf.c | 6 ++++++ fs/binfmt_elf_fdpic.c | 6 ++++++ fs/compat_binfmt_elf.c | 10 ++++++++++ 3 files changed, 22 insertions(+) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 06dc4a57ba78..3039a6b7aba4 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -257,6 +257,12 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes); #ifdef ELF_HWCAP2 NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); +#endif +#ifdef ELF_HWCAP3 + NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3); +#endif +#ifdef ELF_HWCAP4 + NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4); #endif NEW_AUX_ENT(AT_EXECFN, bprm->exec); if (k_platform) { diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 4fe5bb9f1b1f..31d253bd3961 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -623,6 +623,12 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP); #ifdef ELF_HWCAP2 NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); +#endif +#ifdef ELF_HWCAP3 + NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3); +#endif +#ifdef ELF_HWCAP4 + NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4); #endif NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE); NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 8f0af4f62631..d5ef5469e4e6 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -80,6 +80,16 @@ #define ELF_HWCAP2 COMPAT_ELF_HWCAP2 #endif +#ifdef COMPAT_ELF_HWCAP3 +#undef ELF_HWCAP3 +#define ELF_HWCAP3 COMPAT_ELF_HWCAP3 +#endif + +#ifdef COMPAT_ELF_HWCAP4 +#undef ELF_HWCAP4 +#define ELF_HWCAP4 COMPAT_ELF_HWCAP4 +#endif + #ifdef COMPAT_ARCH_DLINFO #undef ARCH_DLINFO #define 
ARCH_DLINFO COMPAT_ARCH_DLINFO From ddadbcdaaed5c3c44cc6c36093f6bf02d942d71d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 4 Oct 2024 21:26:30 +0100 Subject: [PATCH 077/168] arm64: Support AT_HWCAP3 We have filled all 64 bits of AT_HWCAP2 so in order to support discovery of further features provide the framework to use the already defined AT_HWCAP3 for further CPU features. Signed-off-by: Mark Brown Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20241004-arm64-elf-hwcap3-v2-2-799d1daad8b0@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/elf_hwcaps.rst | 6 +++--- arch/arm64/include/asm/cpufeature.h | 3 ++- arch/arm64/include/asm/hwcap.h | 6 +++++- arch/arm64/include/uapi/asm/hwcap.h | 4 ++++ arch/arm64/kernel/cpufeature.c | 6 ++++++ 5 files changed, 20 insertions(+), 5 deletions(-) diff --git a/Documentation/arch/arm64/elf_hwcaps.rst b/Documentation/arch/arm64/elf_hwcaps.rst index 694f67fa07d1..dc1b11d6d112 100644 --- a/Documentation/arch/arm64/elf_hwcaps.rst +++ b/Documentation/arch/arm64/elf_hwcaps.rst @@ -16,9 +16,9 @@ architected discovery mechanism available to userspace code at EL0. The kernel exposes the presence of these features to userspace through a set of flags called hwcaps, exposed in the auxiliary vector. -Userspace software can test for features by acquiring the AT_HWCAP or -AT_HWCAP2 entry of the auxiliary vector, and testing whether the relevant -flags are set, e.g.:: +Userspace software can test for features by acquiring the AT_HWCAP, +AT_HWCAP2 or AT_HWCAP3 entry of the auxiliary vector, and testing +whether the relevant flags are set, e.g.:: bool floating_point_is_present(void) { diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 3d261cc123c1..38e7d1a44ea3 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -12,7 +12,7 @@ #include #include -#define MAX_CPU_FEATURES 128 +#define MAX_CPU_FEATURES 192 #define cpu_feature(x) KERNEL_HWCAP_ ## x #define ARM64_SW_FEATURE_OVERRIDE_NOKASLR 0 @@ -438,6 +438,7 @@ void cpu_set_feature(unsigned int num); bool cpu_have_feature(unsigned int num); unsigned long cpu_get_elf_hwcap(void); unsigned long cpu_get_elf_hwcap2(void); +unsigned long cpu_get_elf_hwcap3(void); #define cpu_set_named_feature(name) cpu_set_feature(cpu_feature(name)) #define cpu_have_named_feature(name) cpu_have_feature(cpu_feature(name)) diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index a775adddecf2..3b5c50df419e 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -159,17 +159,21 @@ #define KERNEL_HWCAP_SME_SF8DP2 __khwcap2_feature(SME_SF8DP2) #define KERNEL_HWCAP_POE __khwcap2_feature(POE) +#define __khwcap3_feature(x) (const_ilog2(HWCAP3_ ## x) + 128) + /* * This yields a mask that user programs can use to figure out what * instruction set this cpu supports. 
*/ #define ELF_HWCAP cpu_get_elf_hwcap() #define ELF_HWCAP2 cpu_get_elf_hwcap2() +#define ELF_HWCAP3 cpu_get_elf_hwcap3() #ifdef CONFIG_COMPAT #define COMPAT_ELF_HWCAP (compat_elf_hwcap) #define COMPAT_ELF_HWCAP2 (compat_elf_hwcap2) -extern unsigned int compat_elf_hwcap, compat_elf_hwcap2; +#define COMPAT_ELF_HWCAP3 (compat_elf_hwcap3) +extern unsigned int compat_elf_hwcap, compat_elf_hwcap2, compat_elf_hwcap3; #endif enum { diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 055381b2c615..3dd4a53a438a 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -124,4 +124,8 @@ #define HWCAP2_SME_SF8DP2 (1UL << 62) #define HWCAP2_POE (1UL << 63) +/* + * HWCAP3 flags - for AT_HWCAP3 + */ + #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 718728a85430..7221636b7907 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -103,6 +103,7 @@ static DECLARE_BITMAP(elf_hwcap, MAX_CPU_FEATURES) __read_mostly; COMPAT_HWCAP_LPAE) unsigned int compat_elf_hwcap __read_mostly = COMPAT_ELF_HWCAP_DEFAULT; unsigned int compat_elf_hwcap2 __read_mostly; +unsigned int compat_elf_hwcap3 __read_mostly; #endif DECLARE_BITMAP(system_cpucaps, ARM64_NCAPS); @@ -3499,6 +3500,11 @@ unsigned long cpu_get_elf_hwcap2(void) return elf_hwcap[1]; } +unsigned long cpu_get_elf_hwcap3(void) +{ + return elf_hwcap[2]; +} + static void __init setup_boot_cpu_capabilities(void) { /* From a2aa5dcc6393dc08844a3f76aa5b7694fbbf99c8 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 16 Aug 2024 16:32:44 +0100 Subject: [PATCH 078/168] kselftest/arm64: signal: drop now redundant GNU_SOURCE definition The definition of GNU_SOURCE was recently centralised in an upper layer kselftest Makefile, so the definition in the arm64 signal tests Makefile is no longer needed. To make things worse, since both definitions are not strictly identical, the compiler warns about it: : warning: "_GNU_SOURCE" redefined : note: this is the location of the previous definition Drop the definition in the arm64/signal Makefile. Fixes: cc937dad85ae ("selftests: centralize -D_GNU_SOURCE= to CFLAGS in lib.mk") Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240816153251.2833702-2-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/signal/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/signal/Makefile b/tools/testing/selftests/arm64/signal/Makefile index edb3613513b8..1381039fb36f 100644 --- a/tools/testing/selftests/arm64/signal/Makefile +++ b/tools/testing/selftests/arm64/signal/Makefile @@ -2,7 +2,7 @@ # Copyright (C) 2019 ARM Limited # Additional include paths needed by kselftest.h and local headers -CFLAGS += -D_GNU_SOURCE -std=gnu99 -I. +CFLAGS += -std=gnu99 -I. SRCS := $(filter-out testcases/testcases.c,$(wildcard testcases/*.c)) PROGS := $(patsubst %.c,%,$(SRCS)) From b0d80dbc378d52155c9ecf9579986edccceed3aa Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 16 Aug 2024 16:32:45 +0100 Subject: [PATCH 079/168] kselftest/arm64: hwcap: fix f8dp2 cpuinfo name The F8DP2 DPISA extension has a separate cpuinfo field, named accordingly. Change the erroneously placed name of "f8dp4" to "f8dp2". 
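For context, the hwcap side of this test is what userspace normally checks by reading the corresponding AT_HWCAP2 auxiliary vector entry; a minimal sketch (assuming glibc's getauxval() and the uapi <asm/hwcap.h> definitions) looks like:

| 	#include <stdbool.h>
| 	#include <sys/auxv.h>
| 	#include <asm/hwcap.h>
|
| 	static bool have_f8dp2(void)
| 	{
| 		/* HWCAP2_F8DP2 is exposed via AT_HWCAP2 */
| 		return getauxval(AT_HWCAP2) & HWCAP2_F8DP2;
| 	}

The same pattern applies to AT_HWCAP3 once flags are allocated there, per the earlier patches in this series.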
Fixes: 44d10c27bd75 ("kselftest/arm64: Add 2023 DPISA hwcap test coverage") Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240816153251.2833702-3-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/abi/hwcap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index f2d6007a2b98..7e95ba5fd496 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -490,7 +490,7 @@ static const struct hwcap_data { .name = "F8DP2", .at_hwcap = AT_HWCAP2, .hwcap_bit = HWCAP2_F8DP2, - .cpuinfo = "f8dp4", + .cpuinfo = "f8dp2", .sigill_fn = f8dp2_sigill, }, { From bf52ca5912c07664276c7b94db820fa2d638b681 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 16 Aug 2024 16:32:46 +0100 Subject: [PATCH 080/168] kselftest/arm64: mte: use proper SKIP syntax If MTE is not available on a system, we detect this early and skip all the MTE selftests. However this happens before we print the TAP plan, so tools parsing the TAP output get confused and report an error. Use the existing ksft_exit_skip() function to handle this, which uses a dummy plan to work with tools expecting proper TAP syntax, as described in the TAP specification. Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240816153251.2833702-4-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/mte_common_util.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.c b/tools/testing/selftests/arm64/mte/mte_common_util.c index 00ffd34c66d3..69e4a67853c4 100644 --- a/tools/testing/selftests/arm64/mte/mte_common_util.c +++ b/tools/testing/selftests/arm64/mte/mte_common_util.c @@ -319,10 +319,9 @@ int mte_default_setup(void) unsigned long en = 0; int ret; - if (!(hwcaps2 & HWCAP2_MTE)) { - ksft_print_msg("SKIP: MTE features unavailable\n"); - return KSFT_SKIP; - } + if (!(hwcaps2 & HWCAP2_MTE)) + ksft_exit_skip("MTE features unavailable\n"); + /* Get current mte mode */ ret = prctl(PR_GET_TAGGED_ADDR_CTRL, en, 0, 0, 0); if (ret < 0) { From 0f995f22a03fef8f3bff51d22a0a78c768536814 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 16 Aug 2024 16:32:47 +0100 Subject: [PATCH 081/168] kselftest/arm64: mte: use string literal for printf-style functions Using pointers for the format specifier strings in printf-style functions can create potential security problems, as the number of arguments to be parsed could vary from call to call. Most compilers consequently warn about those: "format not a string literal and no format arguments [-Wformat-security]" If we only want to print a constant string, we can just use a fixed "%s" format instead, and pass the string as an argument. 
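Concretely, the change amounts to the following (mirroring the evaluate_test() hunks below):

| 	ksft_test_result_pass(msg);		/* warns: format not a string literal */
| 	ksft_test_result_pass("%s", msg);	/* fixed: literal format, msg passed as argument */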
Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240816153251.2833702-5-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/mte_common_util.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.h b/tools/testing/selftests/arm64/mte/mte_common_util.h index 2d3e71724e55..a0017a303beb 100644 --- a/tools/testing/selftests/arm64/mte/mte_common_util.h +++ b/tools/testing/selftests/arm64/mte/mte_common_util.h @@ -77,13 +77,13 @@ static inline void evaluate_test(int err, const char *msg) { switch (err) { case KSFT_PASS: - ksft_test_result_pass(msg); + ksft_test_result_pass("%s", msg); break; case KSFT_FAIL: - ksft_test_result_fail(msg); + ksft_test_result_fail("%s", msg); break; case KSFT_SKIP: - ksft_test_result_skip(msg); + ksft_test_result_skip("%s", msg); break; default: ksft_test_result_error("Unknown return code %d from %s", From 7e893dc81de3e342156389ea0b83ec7d07f25281 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 16 Aug 2024 16:32:49 +0100 Subject: [PATCH 082/168] kselftest/arm64: mte: fix printf type warnings about __u64 When printing the signal context's PC, we use a "%lx" format specifier, which matches the common userland (glibc's) definition of uint64_t as an "unsigned long". However the structure in question is defined in a kernel uapi header, which uses a self defined __u64 type, and the arm64 kernel headers define this using "int-ll64.h", so it becomes an "unsigned long long". This mismatch leads to the usual compiler warning. The common fix would be to use "PRIx64", but because this is defined by the userland's toolchain libc headers, it wouldn't match as well. Since we know the exact type of __u64, just use "%llx" here instead, to silence this warning. This also fixes a more severe typo: "$lx" is not a valid format specifier. 
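As a short illustration of the mismatch (assuming the arm64 uapi int-ll64.h definition of __u64 described above):

| 	__u64 pc = ((ucontext_t *)uc)->uc_mcontext.pc;
|
| 	printf("pc=%lx\n", pc);		/* warns: __u64 is 'unsigned long long' here */
| 	printf("pc=%llx\n", pc);	/* matches the uapi type directly */
| 	printf("pc=%" PRIx64 "\n", pc);	/* PRIx64 follows the libc's uint64_t and may still mismatch */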
Fixes: 191e678bdc9b ("kselftest/arm64: Log unexpected asynchronous MTE faults") Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240816153251.2833702-7-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/mte_common_util.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.c b/tools/testing/selftests/arm64/mte/mte_common_util.c index 69e4a67853c4..9380edca29c7 100644 --- a/tools/testing/selftests/arm64/mte/mte_common_util.c +++ b/tools/testing/selftests/arm64/mte/mte_common_util.c @@ -38,7 +38,7 @@ void mte_default_handler(int signum, siginfo_t *si, void *uc) if (cur_mte_cxt.trig_si_code == si->si_code) cur_mte_cxt.fault_valid = true; else - ksft_print_msg("Got unexpected SEGV_MTEAERR at pc=$lx, fault addr=%lx\n", + ksft_print_msg("Got unexpected SEGV_MTEAERR at pc=%llx, fault addr=%lx\n", ((ucontext_t *)uc)->uc_mcontext.pc, addr); return; @@ -64,7 +64,7 @@ void mte_default_handler(int signum, siginfo_t *si, void *uc) exit(1); } } else if (signum == SIGBUS) { - ksft_print_msg("INFO: SIGBUS signal at pc=%lx, fault addr=%lx, si_code=%lx\n", + ksft_print_msg("INFO: SIGBUS signal at pc=%llx, fault addr=%lx, si_code=%x\n", ((ucontext_t *)uc)->uc_mcontext.pc, addr, si->si_code); if ((cur_mte_cxt.trig_range >= 0 && addr >= MT_CLEAR_TAG(cur_mte_cxt.trig_addr) && From 4716f719202e900b52f5f2270ac16b6a8ae40a47 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 16 Aug 2024 16:32:50 +0100 Subject: [PATCH 083/168] kselftest/arm64: mte: fix printf type warnings about pointers When printing the value of a pointer, we should not use an integer format specifier, but the dedicated "%p" instead. Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240816153251.2833702-8-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/check_buffer_fill.c | 4 ++-- tools/testing/selftests/arm64/mte/mte_common_util.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/arm64/mte/check_buffer_fill.c b/tools/testing/selftests/arm64/mte/check_buffer_fill.c index 1dbbbd47dd50..2ee7f114d7fa 100644 --- a/tools/testing/selftests/arm64/mte/check_buffer_fill.c +++ b/tools/testing/selftests/arm64/mte/check_buffer_fill.c @@ -91,7 +91,7 @@ static int check_buffer_underflow_by_byte(int mem_type, int mode, for (j = 0; j < sizes[i]; j++) { if (ptr[j] != '1') { err = true; - ksft_print_msg("Buffer is not filled at index:%d of ptr:0x%lx\n", + ksft_print_msg("Buffer is not filled at index:%d of ptr:0x%p\n", j, ptr); break; } @@ -189,7 +189,7 @@ static int check_buffer_overflow_by_byte(int mem_type, int mode, for (j = 0; j < sizes[i]; j++) { if (ptr[j] != '1') { err = true; - ksft_print_msg("Buffer is not filled at index:%d of ptr:0x%lx\n", + ksft_print_msg("Buffer is not filled at index:%d of ptr:0x%p\n", j, ptr); break; } diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.c b/tools/testing/selftests/arm64/mte/mte_common_util.c index 9380edca29c7..17fbe5cfe472 100644 --- a/tools/testing/selftests/arm64/mte/mte_common_util.c +++ b/tools/testing/selftests/arm64/mte/mte_common_util.c @@ -100,7 +100,7 @@ void *mte_insert_tags(void *ptr, size_t size) int align_size; if (!ptr || (unsigned long)(ptr) & MT_ALIGN_GRANULE) { - ksft_print_msg("FAIL: Addr=%lx: invalid\n", ptr); + ksft_print_msg("FAIL: Addr=%p: invalid\n", ptr); return NULL; } align_size = MT_ALIGN_UP(size); @@ 
-112,7 +112,7 @@ void *mte_insert_tags(void *ptr, size_t size) void mte_clear_tags(void *ptr, size_t size) { if (!ptr || (unsigned long)(ptr) & MT_ALIGN_GRANULE) { - ksft_print_msg("FAIL: Addr=%lx: invalid\n", ptr); + ksft_print_msg("FAIL: Addr=%p: invalid\n", ptr); return; } size = MT_ALIGN_UP(size); From 96dddb7b9406259baace9a1831e8da155311be6f Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 16 Aug 2024 16:32:51 +0100 Subject: [PATCH 084/168] kselftest/arm64: mte: fix printf type warnings about longs When checking MTE tags, we print some diagnostic messages when the tests fail. Some variables uses there are "longs", however we only use "%x" for the format specifier. Update the format specifiers to "%lx", to match the variable types they are supposed to print. Fixes: f3b2a26ca78d ("kselftest/arm64: Verify mte tag inclusion via prctl") Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240816153251.2833702-9-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/check_tags_inclusion.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/arm64/mte/check_tags_inclusion.c b/tools/testing/selftests/arm64/mte/check_tags_inclusion.c index 2b1425b92b69..a3d1e23fe02a 100644 --- a/tools/testing/selftests/arm64/mte/check_tags_inclusion.c +++ b/tools/testing/selftests/arm64/mte/check_tags_inclusion.c @@ -65,7 +65,7 @@ static int check_single_included_tags(int mem_type, int mode) ptr = mte_insert_tags(ptr, BUFFER_SIZE); /* Check tag value */ if (MT_FETCH_TAG((uintptr_t)ptr) == tag) { - ksft_print_msg("FAIL: wrong tag = 0x%x with include mask=0x%x\n", + ksft_print_msg("FAIL: wrong tag = 0x%lx with include mask=0x%x\n", MT_FETCH_TAG((uintptr_t)ptr), MT_INCLUDE_VALID_TAG(tag)); result = KSFT_FAIL; @@ -97,7 +97,7 @@ static int check_multiple_included_tags(int mem_type, int mode) ptr = mte_insert_tags(ptr, BUFFER_SIZE); /* Check tag value */ if (MT_FETCH_TAG((uintptr_t)ptr) < tag) { - ksft_print_msg("FAIL: wrong tag = 0x%x with include mask=0x%x\n", + ksft_print_msg("FAIL: wrong tag = 0x%lx with include mask=0x%lx\n", MT_FETCH_TAG((uintptr_t)ptr), MT_INCLUDE_VALID_TAGS(excl_mask)); result = KSFT_FAIL; From fc7454107d1b7c27bb98d3b109e5f44a8a46d7f8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 18 Oct 2024 09:53:49 +0200 Subject: [PATCH 085/168] arm64/lib: Handle CRC-32 alternative in C code In preparation for adding another code path for performing CRC-32, move the alternative patching for ARM64_HAS_CRC32 into C code. The logic for deciding whether to use this new code path will be implemented in C too. 
Reviewed-by: Eric Biggers Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20241018075347.2821102-6-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/lib/Makefile | 2 +- arch/arm64/lib/crc32-glue.c | 34 ++++++++++++++++++++++++++++++++++ arch/arm64/lib/crc32.S | 22 ++++++---------------- 3 files changed, 41 insertions(+), 17 deletions(-) create mode 100644 arch/arm64/lib/crc32-glue.c diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 13e6a2829116..8e882f479d98 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -13,7 +13,7 @@ endif lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o -obj-$(CONFIG_CRC32) += crc32.o +obj-$(CONFIG_CRC32) += crc32.o crc32-glue.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o diff --git a/arch/arm64/lib/crc32-glue.c b/arch/arm64/lib/crc32-glue.c new file mode 100644 index 000000000000..0b51761d4b75 --- /dev/null +++ b/arch/arm64/lib/crc32-glue.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +#include + +asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len); + +u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) +{ + if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) + return crc32_le_base(crc, p, len); + + return crc32_le_arm64(crc, p, len); +} + +u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len) +{ + if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) + return __crc32c_le_base(crc, p, len); + + return crc32c_le_arm64(crc, p, len); +} + +u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) +{ + if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) + return crc32_be_base(crc, p, len); + + return crc32_be_arm64(crc, p, len); +} diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S index 8340dccff46f..22139691c7ae 100644 --- a/arch/arm64/lib/crc32.S +++ b/arch/arm64/lib/crc32.S @@ -6,7 +6,6 @@ */ #include -#include #include .arch armv8-a+crc @@ -136,25 +135,16 @@ CPU_BE( rev16 \reg, \reg ) .endm .align 5 -SYM_FUNC_START(crc32_le) -alternative_if_not ARM64_HAS_CRC32 - b crc32_le_base -alternative_else_nop_endif +SYM_FUNC_START(crc32_le_arm64) __crc32 -SYM_FUNC_END(crc32_le) +SYM_FUNC_END(crc32_le_arm64) .align 5 -SYM_FUNC_START(__crc32c_le) -alternative_if_not ARM64_HAS_CRC32 - b __crc32c_le_base -alternative_else_nop_endif +SYM_FUNC_START(crc32c_le_arm64) __crc32 c -SYM_FUNC_END(__crc32c_le) +SYM_FUNC_END(crc32c_le_arm64) .align 5 -SYM_FUNC_START(crc32_be) -alternative_if_not ARM64_HAS_CRC32 - b crc32_be_base -alternative_else_nop_endif +SYM_FUNC_START(crc32_be_arm64) __crc32 be=1 -SYM_FUNC_END(crc32_be) +SYM_FUNC_END(crc32_be_arm64) From b98b23e19492f4009070761c53b755f623f60e49 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 18 Oct 2024 09:53:50 +0200 Subject: [PATCH 086/168] arm64/crc32: Reorganize bit/byte ordering macros In preparation for a new user, reorganize the bit/byte ordering macros that are used to parameterize the crc32 template code and instantiate CRC-32, CRC-32c and 'big endian' CRC-32. 
Signed-off-by: Ard Biesheuvel Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20241018075347.2821102-7-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/lib/crc32.S | 93 ++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 53 deletions(-) diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S index 22139691c7ae..f9920492f135 100644 --- a/arch/arm64/lib/crc32.S +++ b/arch/arm64/lib/crc32.S @@ -10,44 +10,48 @@ .arch armv8-a+crc - .macro byteorder, reg, be - .if \be -CPU_LE( rev \reg, \reg ) - .else -CPU_BE( rev \reg, \reg ) - .endif + .macro bitle, reg .endm - .macro byteorder16, reg, be - .if \be -CPU_LE( rev16 \reg, \reg ) - .else -CPU_BE( rev16 \reg, \reg ) - .endif - .endm - - .macro bitorder, reg, be - .if \be + .macro bitbe, reg rbit \reg, \reg - .endif .endm - .macro bitorder16, reg, be - .if \be - rbit \reg, \reg - lsr \reg, \reg, #16 - .endif + .macro bytele, reg .endm - .macro bitorder8, reg, be - .if \be + .macro bytebe, reg rbit \reg, \reg lsr \reg, \reg, #24 - .endif .endm - .macro __crc32, c, be=0 - bitorder w0, \be + .macro hwordle, reg +CPU_BE( rev16 \reg, \reg ) + .endm + + .macro hwordbe, reg +CPU_LE( rev \reg, \reg ) + rbit \reg, \reg +CPU_BE( lsr \reg, \reg, #16 ) + .endm + + .macro le, regs:vararg + .irp r, \regs +CPU_BE( rev \r, \r ) + .endr + .endm + + .macro be, regs:vararg + .irp r, \regs +CPU_LE( rev \r, \r ) + .endr + .irp r, \regs + rbit \r, \r + .endr + .endm + + .macro __crc32, c, order=le + bit\order w0 cmp x2, #16 b.lt 8f // less than 16 bytes @@ -60,14 +64,7 @@ CPU_BE( rev16 \reg, \reg ) add x8, x8, x1 add x1, x1, x7 ldp x5, x6, [x8] - byteorder x3, \be - byteorder x4, \be - byteorder x5, \be - byteorder x6, \be - bitorder x3, \be - bitorder x4, \be - bitorder x5, \be - bitorder x6, \be + \order x3, x4, x5, x6 tst x7, #8 crc32\c\()x w8, w0, x3 @@ -95,42 +92,32 @@ CPU_BE( rev16 \reg, \reg ) 32: ldp x3, x4, [x1], #32 sub x2, x2, #32 ldp x5, x6, [x1, #-16] - byteorder x3, \be - byteorder x4, \be - byteorder x5, \be - byteorder x6, \be - bitorder x3, \be - bitorder x4, \be - bitorder x5, \be - bitorder x6, \be + \order x3, x4, x5, x6 crc32\c\()x w0, w0, x3 crc32\c\()x w0, w0, x4 crc32\c\()x w0, w0, x5 crc32\c\()x w0, w0, x6 cbnz x2, 32b -0: bitorder w0, \be +0: bit\order w0 ret 8: tbz x2, #3, 4f ldr x3, [x1], #8 - byteorder x3, \be - bitorder x3, \be + \order x3 crc32\c\()x w0, w0, x3 4: tbz x2, #2, 2f ldr w3, [x1], #4 - byteorder w3, \be - bitorder w3, \be + \order w3 crc32\c\()w w0, w0, w3 2: tbz x2, #1, 1f ldrh w3, [x1], #2 - byteorder16 w3, \be - bitorder16 w3, \be + hword\order w3 crc32\c\()h w0, w0, w3 1: tbz x2, #0, 0f ldrb w3, [x1] - bitorder8 w3, \be + byte\order w3 crc32\c\()b w0, w0, w3 -0: bitorder w0, \be +0: bit\order w0 ret .endm @@ -146,5 +133,5 @@ SYM_FUNC_END(crc32c_le_arm64) .align 5 SYM_FUNC_START(crc32_be_arm64) - __crc32 be=1 + __crc32 order=be SYM_FUNC_END(crc32_be_arm64) From a6478d69cf56d5deb4c28a6486376d9c7895abec Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 18 Oct 2024 09:53:51 +0200 Subject: [PATCH 087/168] arm64/crc32: Implement 4-way interleave using PMULL Now that kernel mode NEON no longer disables preemption, using FP/SIMD in library code which is not obviously part of the crypto subsystem is no longer problematic, as it will no longer incur unexpected latencies. So accelerate the CRC-32 library code on arm64 to use a 4-way interleave, using PMULL instructions to implement the folding. 
On Apple M2, this results in a speedup of 2 - 2.8x when using input sizes of 1k - 8k. For smaller sizes, the overhead of preserving and restoring the FP/SIMD register file may not be worth it, so 1k is used as a threshold for choosing this code path. The coefficient tables were generated using code provided by Eric. [0] [0] https://github.com/ebiggers/libdeflate/blob/master/scripts/gen_crc32_multipliers.c Cc: Eric Biggers Signed-off-by: Ard Biesheuvel Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20241018075347.2821102-8-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/lib/crc32-glue.c | 48 ++++++++ arch/arm64/lib/crc32.S | 231 +++++++++++++++++++++++++++++++++++- 2 files changed, 276 insertions(+), 3 deletions(-) diff --git a/arch/arm64/lib/crc32-glue.c b/arch/arm64/lib/crc32-glue.c index 0b51761d4b75..295ae3e6b997 100644 --- a/arch/arm64/lib/crc32-glue.c +++ b/arch/arm64/lib/crc32-glue.c @@ -4,16 +4,40 @@ #include #include +#include +#include +#include + +#include + +// The minimum input length to consider the 4-way interleaved code path +static const size_t min_len = 1024; asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len); asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len); asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len); + u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) { if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) return crc32_le_base(crc, p, len); + if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc32_le_arm64_4way(crc, p, len); + kernel_neon_end(); + + p += round_down(len, 64); + len %= 64; + + if (!len) + return crc; + } + return crc32_le_arm64(crc, p, len); } @@ -22,6 +46,18 @@ u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len) if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) return __crc32c_le_base(crc, p, len); + if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc32c_le_arm64_4way(crc, p, len); + kernel_neon_end(); + + p += round_down(len, 64); + len %= 64; + + if (!len) + return crc; + } + return crc32c_le_arm64(crc, p, len); } @@ -30,5 +66,17 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) return crc32_be_base(crc, p, len); + if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc32_be_arm64_4way(crc, p, len); + kernel_neon_end(); + + p += round_down(len, 64); + len %= 64; + + if (!len) + return crc; + } + return crc32_be_arm64(crc, p, len); } diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S index f9920492f135..68825317460f 100644 --- a/arch/arm64/lib/crc32.S +++ b/arch/arm64/lib/crc32.S @@ -1,14 +1,17 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Accelerated CRC32(C) using AArch64 CRC instructions + * Accelerated CRC32(C) using AArch64 CRC and PMULL instructions * - * Copyright (C) 2016 - 2018 Linaro Ltd + * Copyright (C) 2016 - 2018 Linaro Ltd. 
+ * Copyright (C) 2024 Google LLC + * + * Author: Ard Biesheuvel */ #include #include - .arch armv8-a+crc + .cpu generic+crc+crypto .macro bitle, reg .endm @@ -135,3 +138,225 @@ SYM_FUNC_END(crc32c_le_arm64) SYM_FUNC_START(crc32_be_arm64) __crc32 order=be SYM_FUNC_END(crc32_be_arm64) + + in .req x1 + len .req x2 + + /* + * w0: input CRC at entry, output CRC at exit + * x1: pointer to input buffer + * x2: length of input in bytes + */ + .macro crc4way, insn, table, order=le + bit\order w0 + lsr len, len, #6 // len := # of 64-byte blocks + + /* Process up to 64 blocks of 64 bytes at a time */ +.La\@: mov x3, #64 + cmp len, #64 + csel x3, x3, len, hi // x3 := min(len, 64) + sub len, len, x3 + + /* Divide the input into 4 contiguous blocks */ + add x4, x3, x3, lsl #1 // x4 := 3 * x3 + add x7, in, x3, lsl #4 // x7 := in + 16 * x3 + add x8, in, x3, lsl #5 // x8 := in + 32 * x3 + add x9, in, x4, lsl #4 // x9 := in + 16 * x4 + + /* Load the folding coefficients from the lookup table */ + adr_l x5, \table - 12 // entry 0 omitted + add x5, x5, x4, lsl #2 // x5 += 12 * x3 + ldp s0, s1, [x5] + ldr s2, [x5, #8] + + /* Zero init partial CRCs for this iteration */ + mov w4, wzr + mov w5, wzr + mov w6, wzr + mov x17, xzr + +.Lb\@: sub x3, x3, #1 + \insn w6, w6, x17 + ldp x10, x11, [in], #16 + ldp x12, x13, [x7], #16 + ldp x14, x15, [x8], #16 + ldp x16, x17, [x9], #16 + + \order x10, x11, x12, x13, x14, x15, x16, x17 + + /* Apply the CRC transform to 4 16-byte blocks in parallel */ + \insn w0, w0, x10 + \insn w4, w4, x12 + \insn w5, w5, x14 + \insn w6, w6, x16 + \insn w0, w0, x11 + \insn w4, w4, x13 + \insn w5, w5, x15 + cbnz x3, .Lb\@ + + /* Combine the 4 partial results into w0 */ + mov v3.d[0], x0 + mov v4.d[0], x4 + mov v5.d[0], x5 + pmull v0.1q, v0.1d, v3.1d + pmull v1.1q, v1.1d, v4.1d + pmull v2.1q, v2.1d, v5.1d + eor v0.8b, v0.8b, v1.8b + eor v0.8b, v0.8b, v2.8b + mov x5, v0.d[0] + eor x5, x5, x17 + \insn w0, w6, x5 + + mov in, x9 + cbnz len, .La\@ + + bit\order w0 + ret + .endm + + .align 5 +SYM_FUNC_START(crc32c_le_arm64_4way) + crc4way crc32cx, .L0 +SYM_FUNC_END(crc32c_le_arm64_4way) + + .align 5 +SYM_FUNC_START(crc32_le_arm64_4way) + crc4way crc32x, .L1 +SYM_FUNC_END(crc32_le_arm64_4way) + + .align 5 +SYM_FUNC_START(crc32_be_arm64_4way) + crc4way crc32x, .L1, be +SYM_FUNC_END(crc32_be_arm64_4way) + + .section .rodata, "a", %progbits + .align 6 +.L0: .long 0xddc0152b, 0xba4fc28e, 0x493c7d27 + .long 0x0715ce53, 0x9e4addf8, 0xba4fc28e + .long 0xc96cfdc0, 0x0715ce53, 0xddc0152b + .long 0xab7aff2a, 0x0d3b6092, 0x9e4addf8 + .long 0x299847d5, 0x878a92a7, 0x39d3b296 + .long 0xb6dd949b, 0xab7aff2a, 0x0715ce53 + .long 0xa60ce07b, 0x83348832, 0x47db8317 + .long 0xd270f1a2, 0xb9e02b86, 0x0d3b6092 + .long 0x65863b64, 0xb6dd949b, 0xc96cfdc0 + .long 0xb3e32c28, 0xbac2fd7b, 0x878a92a7 + .long 0xf285651c, 0xce7f39f4, 0xdaece73e + .long 0x271d9844, 0xd270f1a2, 0xab7aff2a + .long 0x6cb08e5c, 0x2b3cac5d, 0x2162d385 + .long 0xcec3662e, 0x1b03397f, 0x83348832 + .long 0x8227bb8a, 0xb3e32c28, 0x299847d5 + .long 0xd7a4825c, 0xdd7e3b0c, 0xb9e02b86 + .long 0xf6076544, 0x10746f3c, 0x18b33a4e + .long 0x98d8d9cb, 0x271d9844, 0xb6dd949b + .long 0x57a3d037, 0x93a5f730, 0x78d9ccb7 + .long 0x3771e98f, 0x6b749fb2, 0xbac2fd7b + .long 0xe0ac139e, 0xcec3662e, 0xa60ce07b + .long 0x6f345e45, 0xe6fc4e6a, 0xce7f39f4 + .long 0xa2b73df1, 0xb0cd4768, 0x61d82e56 + .long 0x86d8e4d2, 0xd7a4825c, 0xd270f1a2 + .long 0xa90fd27a, 0x0167d312, 0xc619809d + .long 0xca6ef3ac, 0x26f6a60a, 0x2b3cac5d + .long 0x4597456a, 0x98d8d9cb, 0x65863b64 + .long 
0xc9c8b782, 0x68bce87a, 0x1b03397f + .long 0x62ec6c6d, 0x6956fc3b, 0xebb883bd + .long 0x2342001e, 0x3771e98f, 0xb3e32c28 + .long 0xe8b6368b, 0x2178513a, 0x064f7f26 + .long 0x9ef68d35, 0x170076fa, 0xdd7e3b0c + .long 0x0b0bf8ca, 0x6f345e45, 0xf285651c + .long 0x02ee03b2, 0xff0dba97, 0x10746f3c + .long 0x135c83fd, 0xf872e54c, 0xc7a68855 + .long 0x00bcf5f6, 0x86d8e4d2, 0x271d9844 + .long 0x58ca5f00, 0x5bb8f1bc, 0x8e766a0c + .long 0xded288f8, 0xb3af077a, 0x93a5f730 + .long 0x37170390, 0xca6ef3ac, 0x6cb08e5c + .long 0xf48642e9, 0xdd66cbbb, 0x6b749fb2 + .long 0xb25b29f2, 0xe9e28eb4, 0x1393e203 + .long 0x45cddf4e, 0xc9c8b782, 0xcec3662e + .long 0xdfd94fb2, 0x93e106a4, 0x96c515bb + .long 0x021ac5ef, 0xd813b325, 0xe6fc4e6a + .long 0x8e1450f7, 0x2342001e, 0x8227bb8a + .long 0xe0cdcf86, 0x6d9a4957, 0xb0cd4768 + .long 0x613eee91, 0xd2c3ed1a, 0x39c7ff35 + .long 0xbedc6ba1, 0x9ef68d35, 0xd7a4825c + .long 0x0cd1526a, 0xf2271e60, 0x0ab3844b + .long 0xd6c3a807, 0x2664fd8b, 0x0167d312 + .long 0x1d31175f, 0x02ee03b2, 0xf6076544 + .long 0x4be7fd90, 0x363bd6b3, 0x26f6a60a + .long 0x6eeed1c9, 0x5fabe670, 0xa741c1bf + .long 0xb3a6da94, 0x00bcf5f6, 0x98d8d9cb + .long 0x2e7d11a7, 0x17f27698, 0x49c3cc9c + .long 0x889774e1, 0xaa7c7ad5, 0x68bce87a + .long 0x8a074012, 0xded288f8, 0x57a3d037 + .long 0xbd0bb25f, 0x6d390dec, 0x6956fc3b + .long 0x3be3c09b, 0x6353c1cc, 0x42d98888 + .long 0x465a4eee, 0xf48642e9, 0x3771e98f + .long 0x2e5f3c8c, 0xdd35bc8d, 0xb42ae3d9 + .long 0xa52f58ec, 0x9a5ede41, 0x2178513a + .long 0x47972100, 0x45cddf4e, 0xe0ac139e + .long 0x359674f7, 0xa51b6135, 0x170076fa + +.L1: .long 0xaf449247, 0x81256527, 0xccaa009e + .long 0x57c54819, 0x1d9513d7, 0x81256527 + .long 0x3f41287a, 0x57c54819, 0xaf449247 + .long 0xf5e48c85, 0x910eeec1, 0x1d9513d7 + .long 0x1f0c2cdd, 0x9026d5b1, 0xae0b5394 + .long 0x71d54a59, 0xf5e48c85, 0x57c54819 + .long 0x1c63267b, 0xfe807bbd, 0x0cbec0ed + .long 0xd31343ea, 0xe95c1271, 0x910eeec1 + .long 0xf9d9c7ee, 0x71d54a59, 0x3f41287a + .long 0x9ee62949, 0xcec97417, 0x9026d5b1 + .long 0xa55d1514, 0xf183c71b, 0xd1df2327 + .long 0x21aa2b26, 0xd31343ea, 0xf5e48c85 + .long 0x9d842b80, 0xeea395c4, 0x3c656ced + .long 0xd8110ff1, 0xcd669a40, 0xfe807bbd + .long 0x3f9e9356, 0x9ee62949, 0x1f0c2cdd + .long 0x1d6708a0, 0x0c30f51d, 0xe95c1271 + .long 0xef82aa68, 0xdb3935ea, 0xb918a347 + .long 0xd14bcc9b, 0x21aa2b26, 0x71d54a59 + .long 0x99cce860, 0x356d209f, 0xff6f2fc2 + .long 0xd8af8e46, 0xc352f6de, 0xcec97417 + .long 0xf1996890, 0xd8110ff1, 0x1c63267b + .long 0x631bc508, 0xe95c7216, 0xf183c71b + .long 0x8511c306, 0x8e031a19, 0x9b9bdbd0 + .long 0xdb3839f3, 0x1d6708a0, 0xd31343ea + .long 0x7a92fffb, 0xf7003835, 0x4470ac44 + .long 0x6ce68f2a, 0x00eba0c8, 0xeea395c4 + .long 0x4caaa263, 0xd14bcc9b, 0xf9d9c7ee + .long 0xb46f7cff, 0x9a1b53c8, 0xcd669a40 + .long 0x60290934, 0x81b6f443, 0x6d40f445 + .long 0x8e976a7d, 0xd8af8e46, 0x9ee62949 + .long 0xdcf5088a, 0x9dbdc100, 0x145575d5 + .long 0x1753ab84, 0xbbf2f6d6, 0x0c30f51d + .long 0x255b139e, 0x631bc508, 0xa55d1514 + .long 0xd784eaa8, 0xce26786c, 0xdb3935ea + .long 0x6d2c864a, 0x8068c345, 0x2586d334 + .long 0x02072e24, 0xdb3839f3, 0x21aa2b26 + .long 0x06689b0a, 0x5efd72f5, 0xe0575528 + .long 0x1e52f5ea, 0x4117915b, 0x356d209f + .long 0x1d3d1db6, 0x6ce68f2a, 0x9d842b80 + .long 0x3796455c, 0xb8e0e4a8, 0xc352f6de + .long 0xdf3a4eb3, 0xc55a2330, 0xb84ffa9c + .long 0x28ae0976, 0xb46f7cff, 0xd8110ff1 + .long 0x9764bc8d, 0xd7e7a22c, 0x712510f0 + .long 0x13a13e18, 0x3e9a43cd, 0xe95c7216 + .long 0xb8ee242e, 0x8e976a7d, 0x3f9e9356 + .long 0x0c540e7b, 
0x753c81ff, 0x8e031a19 + .long 0x9924c781, 0xb9220208, 0x3edcde65 + .long 0x3954de39, 0x1753ab84, 0x1d6708a0 + .long 0xf32238b5, 0xbec81497, 0x9e70b943 + .long 0xbbd2cd2c, 0x0925d861, 0xf7003835 + .long 0xcc401304, 0xd784eaa8, 0xef82aa68 + .long 0x4987e684, 0x6044fbb0, 0x00eba0c8 + .long 0x3aa11427, 0x18fe3b4a, 0x87441142 + .long 0x297aad60, 0x02072e24, 0xd14bcc9b + .long 0xf60c5e51, 0x6ef6f487, 0x5b7fdd0a + .long 0x632d78c5, 0x3fc33de4, 0x9a1b53c8 + .long 0x25b8822a, 0x1e52f5ea, 0x99cce860 + .long 0xd4fc84bc, 0x1af62fb8, 0x81b6f443 + .long 0x5690aa32, 0xa91fdefb, 0x688a110e + .long 0x1357a093, 0x3796455c, 0xd8af8e46 + .long 0x798fdd33, 0xaaa18a37, 0x357b9517 + .long 0xc2815395, 0x54d42691, 0x9dbdc100 + .long 0x21cfc0f7, 0x28ae0976, 0xf1996890 + .long 0xa0decef3, 0x7b4aa8b7, 0xbbf2f6d6 From f260c442676310329b8b4482d3be59fe0e742220 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 21 Oct 2024 17:44:56 +0100 Subject: [PATCH 088/168] arm64: preserve pt_regs::stackframe during exec*() When performing an exec*(), there's a transient period before the return to userspace where any stacktrace will result in a warning triggered by kunwind_next_frame_record_meta() encountering a struct frame_record_meta with an unknown type. This can be seen fairly reliably by enabling KASAN or KFENCE, e.g. | WARNING: CPU: 3 PID: 143 at arch/arm64/kernel/stacktrace.c:223 arch_stack_walk+0x264/0x3b0 | Modules linked in: | CPU: 3 UID: 0 PID: 143 Comm: login Not tainted 6.12.0-rc2-00010-g0f0b9a3f6a50 #1 | Hardware name: linux,dummy-virt (DT) | pstate: 814000c5 (Nzcv daIF +PAN -UAO -TCO +DIT -SSBS BTYPE=--) | pc : arch_stack_walk+0x264/0x3b0 | lr : arch_stack_walk+0x1ec/0x3b0 | sp : ffff80008060b970 | x29: ffff80008060ba10 x28: fff00000051133c0 x27: 0000000000000000 | x26: 0000000000000000 x25: 0000000000000000 x24: fff000007fe84000 | x23: ffff9d1b3c940af0 x22: 0000000000000000 x21: fff00000051133c0 | x20: ffff80008060ba50 x19: ffff9d1b3c9408e0 x18: 0000000000000014 | x17: 000000006d50da47 x16: 000000008e3f265e x15: fff0000004e8bf40 | x14: 0000ffffc5e50e48 x13: 000000000000000f x12: 0000ffffc5e50fed | x11: 000000000000001f x10: 000018007f8bffff x9 : 0000000000000000 | x8 : ffff80008060b9c0 x7 : ffff80008060bfd8 x6 : ffff80008060ba80 | x5 : ffff80008060ba00 x4 : ffff80008060c000 x3 : ffff80008060bff0 | x2 : 0000000000000018 x1 : ffff80008060bfd8 x0 : 0000000000000000 | Call trace: | arch_stack_walk+0x264/0x3b0 (P) | arch_stack_walk+0x1ec/0x3b0 (L) | stack_trace_save+0x50/0x80 | metadata_update_state+0x98/0xa0 | kfence_guarded_free+0xec/0x2c4 | __kfence_free+0x50/0x100 | kmem_cache_free+0x1a4/0x37c | putname+0x9c/0xc0 | do_execveat_common.isra.0+0xf0/0x1e4 | __arm64_sys_execve+0x40/0x60 | invoke_syscall+0x48/0x104 | el0_svc_common.constprop.0+0x40/0xe0 | do_el0_svc+0x1c/0x28 | el0_svc+0x34/0xe0 | el0t_64_sync_handler+0x120/0x12c | el0t_64_sync+0x198/0x19c This happens because start_thread_common() zeroes the entirety of current_pt_regs(), including pt_regs::stackframe::type, changing this from FRAME_META_TYPE_FINAL to 0 and making the final record invalid. The stacktrace code will reject this until the next return to userspace, where a subsequent exception entry will reinitialize the type to FRAME_META_TYPE_FINAL. This zeroing wasn't a problem prior to commit: c2c6b27b5aa14fa2 ("arm64: stacktrace: unwind exception boundaries") ... as before that commit the stacktrace code only expected the final pt_regs::stackframe to contain zeroes, which was unchanged by start_thread_common(). 
A stacktrace could occur at any time, either due to instrumentation or an exception, and so start_thread_common() must ensure that pt_regs::stackframe is always valid. Fix this by changing the way start_thread_common() zeroes and reinitializes the pt_regs fields: * The '{regs,pc,pstate}' fields are initialized in one go via a struct assignment to the user_regs, with start_thread() and compat_start_thread() modified to pass 'pstate' into start_thread_common(). * The 'sp' and 'compat_sp' fields are zeroed by the struct assignment in start_thread_common(), and subsequently overwritten in start_thread() and compat_start_thread respectively, matching existing behaviour. * The 'syscallno' field is implicitly preserved while the 'orig_x0' field is explicitly zeroed, maintaining existing ABI. * The 'pmr' field is explicitly initialized, as necessary for an exec*() from a kernel thread, matching existing behaviour. * The 'stackframe' field is implicitly preserved, with a new comment and some assertions to ensure we don't accidentally break this in future. * All other fields are implicitly preserved, and should have no functional impact: - 'sdei_ttbr1' is only used for SDEI exception entry/exit, and we never exec*() inside an SDEI handler. - 'lockdep_hardirqs' and 'exit_rcu' are only used for EL1 exception entry/exit, and we never exec*() inside an EL1 exception handler. While updating compat_start_thread() to pass 'pstate' into start_thread_common(), I've also updated the logic to remove the ifdeffery, replacing: | #ifdef __AARCH64EB__ | regs->pstate |= PSR_AA32_E_BIT; | #endif ... with: | if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) | pstate |= PSR_AA32_E_BIT; ... which should be functionally equivalent, and matches our preferred code style. Fixes: c2c6b27b5aa1 ("arm64: stacktrace: unwind exception boundaries") Signed-off-by: Mark Rutland Cc: Mark Brown Cc: Miroslav Benes Cc: Puranjay Mohan Cc: Will Deacon Fixes: c2c6b27b5aa1 ("arm64: stacktrace: unwind exception boundaries") Tested-by: Puranjay Mohan Reviewed-by: Puranjay Mohan Link: https://lore.kernel.org/r/20241021164456.2275285-1-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/processor.h | 48 +++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 03b99164f7f4..e277105fb87a 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -285,22 +285,44 @@ void tls_preserve_current_state(void); .fpsimd_cpu = NR_CPUS, \ } -static inline void start_thread_common(struct pt_regs *regs, unsigned long pc) +static inline void start_thread_common(struct pt_regs *regs, unsigned long pc, + unsigned long pstate) { - s32 previous_syscall = regs->syscallno; - memset(regs, 0, sizeof(*regs)); - regs->syscallno = previous_syscall; - regs->pc = pc; + /* + * Ensure all GPRs are zeroed, and initialize PC + PSTATE. + * The SP (or compat SP) will be initialized later. + */ + regs->user_regs = (struct user_pt_regs) { + .pc = pc, + .pstate = pstate, + }; + /* + * To allow the syscalls:sys_exit_execve tracepoint we need to preserve + * syscallno, but do not need orig_x0 or the original GPRs. + */ + regs->orig_x0 = 0; + + /* + * An exec from a kernel thread won't have an existing PMR value. + */ if (system_uses_irq_prio_masking()) regs->pmr = GIC_PRIO_IRQON; + + /* + * The pt_regs::stackframe field must remain valid throughout this + * function as a stacktrace can be taken at any time. 
Any user or + * kernel task should have a valid final frame. + */ + WARN_ON_ONCE(regs->stackframe.record.fp != 0); + WARN_ON_ONCE(regs->stackframe.record.lr != 0); + WARN_ON_ONCE(regs->stackframe.type != FRAME_META_TYPE_FINAL); } static inline void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp) { - start_thread_common(regs, pc); - regs->pstate = PSR_MODE_EL0t; + start_thread_common(regs, pc, PSR_MODE_EL0t); spectre_v4_enable_task_mitigation(current); regs->sp = sp; } @@ -309,15 +331,13 @@ static inline void start_thread(struct pt_regs *regs, unsigned long pc, static inline void compat_start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp) { - start_thread_common(regs, pc); - regs->pstate = PSR_AA32_MODE_USR; + unsigned long pstate = PSR_AA32_MODE_USR; if (pc & 1) - regs->pstate |= PSR_AA32_T_BIT; - -#ifdef __AARCH64EB__ - regs->pstate |= PSR_AA32_E_BIT; -#endif + pstate |= PSR_AA32_T_BIT; + if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) + pstate |= PSR_AA32_E_BIT; + start_thread_common(regs, pc, pstate); spectre_v4_enable_task_mitigation(current); regs->compat_sp = sp; } From 7a08cb9b4bb92fb86f5fe8a3aa0ac08a9b3d783b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 17 Oct 2024 18:43:31 +0100 Subject: [PATCH 089/168] kselftest/arm64: Fail the overall fp-stress test if any test fails Currently fp-stress does not report a top level test result if it runs to completion, it always exits with a return code 0. Use the ksft_finished() helper to ensure that the exit code for the top level program reports a failure if any of the individual tests has failed. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241017-arm64-fp-stress-exit-code-v1-1-f528e53a2321@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/fp-stress.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c index faac24bdefeb..e62c9dbad501 100644 --- a/tools/testing/selftests/arm64/fp/fp-stress.c +++ b/tools/testing/selftests/arm64/fp/fp-stress.c @@ -651,7 +651,5 @@ int main(int argc, char **argv) drain_output(true); - ksft_print_cnts(); - - return 0; + ksft_finished(); } From b880a80011f56880f32bde47fc6af313359f926b Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 17 Oct 2024 14:14:24 +0100 Subject: [PATCH 090/168] arm64: rsi: Add RSI definitions The RMM (Realm Management Monitor) provides functionality that can be accessed by a realm guest through SMC (Realm Services Interface) calls. The SMC definitions are based on DEN0137[1] version 1.0-rel0. [1] https://developer.arm.com/documentation/den0137/1-0rel0/ Acked-by: Catalin Marinas Reviewed-by: Gavin Shan Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-2-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/rsi_cmds.h | 139 +++++++++++++++++++++ arch/arm64/include/asm/rsi_smc.h | 193 ++++++++++++++++++++++++++++++ 2 files changed, 332 insertions(+) create mode 100644 arch/arm64/include/asm/rsi_cmds.h create mode 100644 arch/arm64/include/asm/rsi_smc.h diff --git a/arch/arm64/include/asm/rsi_cmds.h b/arch/arm64/include/asm/rsi_cmds.h new file mode 100644 index 000000000000..2fcf351b5634 --- /dev/null +++ b/arch/arm64/include/asm/rsi_cmds.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023 ARM Ltd. 
+ */ + +#ifndef __ASM_RSI_CMDS_H +#define __ASM_RSI_CMDS_H + +#include + +#include + +#define RSI_GRANULE_SHIFT 12 +#define RSI_GRANULE_SIZE (_AC(1, UL) << RSI_GRANULE_SHIFT) + +enum ripas { + RSI_RIPAS_EMPTY = 0, + RSI_RIPAS_RAM = 1, + RSI_RIPAS_DESTROYED = 2, + RSI_RIPAS_DEV = 3, +}; + +static inline unsigned long rsi_request_version(unsigned long req, + unsigned long *out_lower, + unsigned long *out_higher) +{ + struct arm_smccc_res res; + + arm_smccc_smc(SMC_RSI_ABI_VERSION, req, 0, 0, 0, 0, 0, 0, &res); + + if (out_lower) + *out_lower = res.a1; + if (out_higher) + *out_higher = res.a2; + + return res.a0; +} + +static inline unsigned long rsi_get_realm_config(struct realm_config *cfg) +{ + struct arm_smccc_res res; + + arm_smccc_smc(SMC_RSI_REALM_CONFIG, virt_to_phys(cfg), + 0, 0, 0, 0, 0, 0, &res); + return res.a0; +} + +static inline long rsi_set_addr_range_state(phys_addr_t start, + phys_addr_t end, + enum ripas state, + unsigned long flags, + phys_addr_t *top) +{ + struct arm_smccc_res res; + + arm_smccc_smc(SMC_RSI_IPA_STATE_SET, start, end, state, + flags, 0, 0, 0, &res); + + if (top) + *top = res.a1; + + if (res.a2 != RSI_ACCEPT) + return -EPERM; + + return res.a0; +} + +/** + * rsi_attestation_token_init - Initialise the operation to retrieve an + * attestation token. + * + * @challenge: The challenge data to be used in the attestation token + * generation. + * @size: Size of the challenge data in bytes. + * + * Initialises the attestation token generation and returns an upper bound + * on the attestation token size that can be used to allocate an adequate + * buffer. The caller is expected to subsequently call + * rsi_attestation_token_continue() to retrieve the attestation token data on + * the same CPU. + * + * Returns: + * On success, returns the upper limit of the attestation report size. + * Otherwise, -EINVAL + */ +static inline long +rsi_attestation_token_init(const u8 *challenge, unsigned long size) +{ + struct arm_smccc_1_2_regs regs = { 0 }; + + /* The challenge must be at least 32bytes and at most 64bytes */ + if (!challenge || size < 32 || size > 64) + return -EINVAL; + + regs.a0 = SMC_RSI_ATTESTATION_TOKEN_INIT; + memcpy(®s.a1, challenge, size); + arm_smccc_1_2_smc(®s, ®s); + + if (regs.a0 == RSI_SUCCESS) + return regs.a1; + + return -EINVAL; +} + +/** + * rsi_attestation_token_continue - Continue the operation to retrieve an + * attestation token. + * + * @granule: {I}PA of the Granule to which the token will be written. + * @offset: Offset within Granule to start of buffer in bytes. + * @size: The size of the buffer. + * @len: The number of bytes written to the buffer. + * + * Retrieves up to a RSI_GRANULE_SIZE worth of token data per call. The caller + * is expected to call rsi_attestation_token_init() before calling this + * function to retrieve the attestation token. + * + * Return: + * * %RSI_SUCCESS - Attestation token retrieved successfully. + * * %RSI_INCOMPLETE - Token generation is not complete. + * * %RSI_ERROR_INPUT - A parameter was not valid. + * * %RSI_ERROR_STATE - Attestation not in progress. 
+ */ +static inline unsigned long rsi_attestation_token_continue(phys_addr_t granule, + unsigned long offset, + unsigned long size, + unsigned long *len) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RSI_ATTESTATION_TOKEN_CONTINUE, + granule, offset, size, 0, &res); + + if (len) + *len = res.a1; + return res.a0; +} + +#endif /* __ASM_RSI_CMDS_H */ diff --git a/arch/arm64/include/asm/rsi_smc.h b/arch/arm64/include/asm/rsi_smc.h new file mode 100644 index 000000000000..6cb070eca9e9 --- /dev/null +++ b/arch/arm64/include/asm/rsi_smc.h @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#ifndef __ASM_RSI_SMC_H_ +#define __ASM_RSI_SMC_H_ + +#include + +/* + * This file describes the Realm Services Interface (RSI) Application Binary + * Interface (ABI) for SMC calls made from within the Realm to the RMM and + * serviced by the RMM. + */ + +/* + * The major version number of the RSI implementation. This is increased when + * the binary format or semantics of the SMC calls change. + */ +#define RSI_ABI_VERSION_MAJOR UL(1) + +/* + * The minor version number of the RSI implementation. This is increased when + * a bug is fixed, or a feature is added without breaking binary compatibility. + */ +#define RSI_ABI_VERSION_MINOR UL(0) + +#define RSI_ABI_VERSION ((RSI_ABI_VERSION_MAJOR << 16) | \ + RSI_ABI_VERSION_MINOR) + +#define RSI_ABI_VERSION_GET_MAJOR(_version) ((_version) >> 16) +#define RSI_ABI_VERSION_GET_MINOR(_version) ((_version) & 0xFFFF) + +#define RSI_SUCCESS UL(0) +#define RSI_ERROR_INPUT UL(1) +#define RSI_ERROR_STATE UL(2) +#define RSI_INCOMPLETE UL(3) +#define RSI_ERROR_UNKNOWN UL(4) + +#define SMC_RSI_FID(n) ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_STANDARD, \ + n) + +/* + * Returns RSI version. + * + * arg1 == Requested interface revision + * ret0 == Status / error + * ret1 == Lower implemented interface revision + * ret2 == Higher implemented interface revision + */ +#define SMC_RSI_ABI_VERSION SMC_RSI_FID(0x190) + +/* + * Read feature register. + * + * arg1 == Feature register index + * ret0 == Status / error + * ret1 == Feature register value + */ +#define SMC_RSI_FEATURES SMC_RSI_FID(0x191) + +/* + * Read measurement for the current Realm. + * + * arg1 == Index, which measurements slot to read + * ret0 == Status / error + * ret1 == Measurement value, bytes: 0 - 7 + * ret2 == Measurement value, bytes: 8 - 15 + * ret3 == Measurement value, bytes: 16 - 23 + * ret4 == Measurement value, bytes: 24 - 31 + * ret5 == Measurement value, bytes: 32 - 39 + * ret6 == Measurement value, bytes: 40 - 47 + * ret7 == Measurement value, bytes: 48 - 55 + * ret8 == Measurement value, bytes: 56 - 63 + */ +#define SMC_RSI_MEASUREMENT_READ SMC_RSI_FID(0x192) + +/* + * Extend Realm Extensible Measurement (REM) value. + * + * arg1 == Index, which measurements slot to extend + * arg2 == Size of realm measurement in bytes, max 64 bytes + * arg3 == Measurement value, bytes: 0 - 7 + * arg4 == Measurement value, bytes: 8 - 15 + * arg5 == Measurement value, bytes: 16 - 23 + * arg6 == Measurement value, bytes: 24 - 31 + * arg7 == Measurement value, bytes: 32 - 39 + * arg8 == Measurement value, bytes: 40 - 47 + * arg9 == Measurement value, bytes: 48 - 55 + * arg10 == Measurement value, bytes: 56 - 63 + * ret0 == Status / error + */ +#define SMC_RSI_MEASUREMENT_EXTEND SMC_RSI_FID(0x193) + +/* + * Initialize the operation to retrieve an attestation token. 
+ * + * arg1 == Challenge value, bytes: 0 - 7 + * arg2 == Challenge value, bytes: 8 - 15 + * arg3 == Challenge value, bytes: 16 - 23 + * arg4 == Challenge value, bytes: 24 - 31 + * arg5 == Challenge value, bytes: 32 - 39 + * arg6 == Challenge value, bytes: 40 - 47 + * arg7 == Challenge value, bytes: 48 - 55 + * arg8 == Challenge value, bytes: 56 - 63 + * ret0 == Status / error + * ret1 == Upper bound of token size in bytes + */ +#define SMC_RSI_ATTESTATION_TOKEN_INIT SMC_RSI_FID(0x194) + +/* + * Continue the operation to retrieve an attestation token. + * + * arg1 == The IPA of token buffer + * arg2 == Offset within the granule of the token buffer + * arg3 == Size of the granule buffer + * ret0 == Status / error + * ret1 == Length of token bytes copied to the granule buffer + */ +#define SMC_RSI_ATTESTATION_TOKEN_CONTINUE SMC_RSI_FID(0x195) + +#ifndef __ASSEMBLY__ + +struct realm_config { + union { + struct { + unsigned long ipa_bits; /* Width of IPA in bits */ + unsigned long hash_algo; /* Hash algorithm */ + }; + u8 pad[0x200]; + }; + union { + u8 rpv[64]; /* Realm Personalization Value */ + u8 pad2[0xe00]; + }; + /* + * The RMM requires the configuration structure to be aligned to a 4k + * boundary, ensure this happens by aligning this structure. + */ +} __aligned(0x1000); + +#endif /* __ASSEMBLY__ */ + +/* + * Read configuration for the current Realm. + * + * arg1 == struct realm_config addr + * ret0 == Status / error + */ +#define SMC_RSI_REALM_CONFIG SMC_RSI_FID(0x196) + +/* + * Request RIPAS of a target IPA range to be changed to a specified value. + * + * arg1 == Base IPA address of target region + * arg2 == Top of the region + * arg3 == RIPAS value + * arg4 == flags + * ret0 == Status / error + * ret1 == Top of modified IPA range + * ret2 == Whether the Host accepted or rejected the request + */ +#define SMC_RSI_IPA_STATE_SET SMC_RSI_FID(0x197) + +#define RSI_NO_CHANGE_DESTROYED UL(0) +#define RSI_CHANGE_DESTROYED UL(1) + +#define RSI_ACCEPT UL(0) +#define RSI_REJECT UL(1) + +/* + * Get RIPAS of a target IPA range. + * + * arg1 == Base IPA of target region + * arg2 == End of target IPA region + * ret0 == Status / error + * ret1 == Top of IPA region which has the reported RIPAS value + * ret2 == RIPAS value + */ +#define SMC_RSI_IPA_STATE_GET SMC_RSI_FID(0x198) + +/* + * Make a Host call. + * + * arg1 == IPA of host call structure + * ret0 == Status / error + */ +#define SMC_RSI_HOST_CALL SMC_RSI_FID(0x199) + +#endif /* __ASM_RSI_SMC_H_ */ From c077711f718be7cebcc8b987eac2ebfd17447e9f Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 17 Oct 2024 14:14:25 +0100 Subject: [PATCH 091/168] arm64: Detect if in a realm and set RIPAS RAM Detect that the VM is a realm guest by the presence of the RSI interface. This is done after PSCI has been initialised so that we can check the SMCCC conduit before making any RSI calls. If in a realm then iterate over all memory ensuring that it is marked as RIPAS RAM. The loader is required to do this for us, however if some memory is missed this will cause the guest to receive a hard to debug external abort at some random point in the future. So for a belt-and-braces approach set all memory to RIPAS RAM. Any failure here implies that the RAM regions passed to Linux are incorrect so panic() promptly to make the situation clear. 
Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Signed-off-by: Suzuki K Poulose Co-developed-by: Steven Price Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-3-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/rsi.h | 66 +++++++++++++++++++++++++++++++ arch/arm64/kernel/Makefile | 3 +- arch/arm64/kernel/rsi.c | 76 ++++++++++++++++++++++++++++++++++++ arch/arm64/kernel/setup.c | 3 ++ 4 files changed, 147 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/include/asm/rsi.h create mode 100644 arch/arm64/kernel/rsi.c diff --git a/arch/arm64/include/asm/rsi.h b/arch/arm64/include/asm/rsi.h new file mode 100644 index 000000000000..acba065eb00e --- /dev/null +++ b/arch/arm64/include/asm/rsi.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2024 ARM Ltd. + */ + +#ifndef __ASM_RSI_H_ +#define __ASM_RSI_H_ + +#include +#include +#include + +DECLARE_STATIC_KEY_FALSE(rsi_present); + +void __init arm64_rsi_init(void); + +static inline bool is_realm_world(void) +{ + return static_branch_unlikely(&rsi_present); +} + +static inline int rsi_set_memory_range(phys_addr_t start, phys_addr_t end, + enum ripas state, unsigned long flags) +{ + unsigned long ret; + phys_addr_t top; + + while (start != end) { + ret = rsi_set_addr_range_state(start, end, state, flags, &top); + if (ret || top < start || top > end) + return -EINVAL; + start = top; + } + + return 0; +} + +/* + * Convert the specified range to RAM. Do not use this if you rely on the + * contents of a page that may already be in RAM state. + */ +static inline int rsi_set_memory_range_protected(phys_addr_t start, + phys_addr_t end) +{ + return rsi_set_memory_range(start, end, RSI_RIPAS_RAM, + RSI_CHANGE_DESTROYED); +} + +/* + * Convert the specified range to RAM. Do not convert any pages that may have + * been DESTROYED, without our permission. + */ +static inline int rsi_set_memory_range_protected_safe(phys_addr_t start, + phys_addr_t end) +{ + return rsi_set_memory_range(start, end, RSI_RIPAS_RAM, + RSI_NO_CHANGE_DESTROYED); +} + +static inline int rsi_set_memory_range_shared(phys_addr_t start, + phys_addr_t end) +{ + return rsi_set_memory_range(start, end, RSI_RIPAS_EMPTY, + RSI_CHANGE_DESTROYED); +} +#endif /* __ASM_RSI_H_ */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 2b112f3b7510..71c29a2a2f19 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -33,7 +33,8 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ - syscall.o proton-pack.o idle.o patching.o pi/ + syscall.o proton-pack.o idle.o patching.o pi/ \ + rsi.o obj-$(CONFIG_COMPAT) += sys32.o signal32.o \ sys_compat.o diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c new file mode 100644 index 000000000000..c5758317dfed --- /dev/null +++ b/arch/arm64/kernel/rsi.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#include +#include +#include +#include + +DEFINE_STATIC_KEY_FALSE_RO(rsi_present); +EXPORT_SYMBOL(rsi_present); + +static bool rsi_version_matches(void) +{ + unsigned long ver_lower, ver_higher; + unsigned long ret = rsi_request_version(RSI_ABI_VERSION, + &ver_lower, + &ver_higher); + + if (ret == SMCCC_RET_NOT_SUPPORTED) + return false; + + if (ret != RSI_SUCCESS) { + pr_err("RME: RMM doesn't support RSI version %lu.%lu. 
Supported range: %lu.%lu-%lu.%lu\n", + RSI_ABI_VERSION_MAJOR, RSI_ABI_VERSION_MINOR, + RSI_ABI_VERSION_GET_MAJOR(ver_lower), + RSI_ABI_VERSION_GET_MINOR(ver_lower), + RSI_ABI_VERSION_GET_MAJOR(ver_higher), + RSI_ABI_VERSION_GET_MINOR(ver_higher)); + return false; + } + + pr_info("RME: Using RSI version %lu.%lu\n", + RSI_ABI_VERSION_GET_MAJOR(ver_lower), + RSI_ABI_VERSION_GET_MINOR(ver_lower)); + + return true; +} + +static void __init arm64_rsi_setup_memory(void) +{ + u64 i; + phys_addr_t start, end; + + /* + * Iterate over the available memory ranges and convert the state to + * protected memory. We should take extra care to ensure that we DO NOT + * permit any "DESTROYED" pages to be converted to "RAM". + * + * panic() is used because if the attempt to switch the memory to + * protected has failed here, then future accesses to the memory are + * simply going to be reflected as a SEA (Synchronous External Abort) + * which we can't handle. Bailing out early prevents the guest limping + * on and dying later. + */ + for_each_mem_range(i, &start, &end) { + if (rsi_set_memory_range_protected_safe(start, end)) { + panic("Failed to set memory range to protected: %pa-%pa", + &start, &end); + } + } +} + +void __init arm64_rsi_init(void) +{ + if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_SMC) + return; + if (!rsi_version_matches()) + return; + + arm64_rsi_setup_memory(); + + static_branch_enable(&rsi_present); +} + diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index b22d28ec8028..b5e1e306fa51 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -351,6 +352,8 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) else psci_acpi_init(); + arm64_rsi_init(); + init_bootcpu_ops(); smp_init_cpus(); smp_build_mpidr_hash(); From 399306954996be58ac20b4b29f6334e3d55a2ce7 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Thu, 17 Oct 2024 14:14:26 +0100 Subject: [PATCH 092/168] arm64: realm: Query IPA size from the RMM The top bit of the configured IPA size is used as an attribute to control whether the address is protected or shared. Query the configuration from the RMM to assertain which bit this is. Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Co-developed-by: Suzuki K Poulose Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-4-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable-prot.h | 4 ++++ arch/arm64/kernel/rsi.c | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 2a11d0c10760..820a3b06f08c 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -68,8 +68,12 @@ #include #include +#include extern bool arm64_use_ng_mappings; +extern unsigned long prot_ns_shared; + +#define PROT_NS_SHARED (is_realm_world() ? prot_ns_shared : 0) #define PTE_MAYBE_NG (arm64_use_ng_mappings ? PTE_NG : 0) #define PMD_MAYBE_NG (arm64_use_ng_mappings ? 
PMD_SECT_NG : 0) diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index c5758317dfed..cea8f0d39591 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -8,6 +8,11 @@ #include #include +static struct realm_config config; + +unsigned long prot_ns_shared; +EXPORT_SYMBOL(prot_ns_shared); + DEFINE_STATIC_KEY_FALSE_RO(rsi_present); EXPORT_SYMBOL(rsi_present); @@ -68,6 +73,9 @@ void __init arm64_rsi_init(void) return; if (!rsi_version_matches()) return; + if (WARN_ON(rsi_get_realm_config(&config))) + return; + prot_ns_shared = BIT(config.ipa_bits - 1); arm64_rsi_setup_memory(); From 371589437616fbb03590d8ff505f8a4c95c8a031 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 17 Oct 2024 14:14:27 +0100 Subject: [PATCH 093/168] arm64: rsi: Add support for checking whether an MMIO is protected On Arm CCA, with RMM-v1.0, all MMIO regions are shared. However, in the future, an Arm CCA-v1.0 compliant guest may be run in a lesser privileged partition in the Realm World (with Arm CCA-v1.1 Planes feature). In this case, some of the MMIO regions may be emulated by a higher privileged component in the Realm world, i.e, protected. Thus the guest must decide today, whether a given MMIO region is shared vs Protected and create the stage1 mapping accordingly. On Arm CCA, this detection is based on the "IPA State" (RIPAS == RIPAS_IO). Provide a helper to run this check on a given range of MMIO. Also, provide a arm64 helper which may be hooked in by other solutions. Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-5-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/io.h | 8 ++++++++ arch/arm64/include/asm/rsi.h | 2 ++ arch/arm64/include/asm/rsi_cmds.h | 21 +++++++++++++++++++++ arch/arm64/kernel/rsi.c | 26 ++++++++++++++++++++++++++ 4 files changed, 57 insertions(+) diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 1ada23a6ec19..8688343b71f2 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -17,6 +17,7 @@ #include #include #include +#include /* * Generic IO read/write. These perform native-endian accesses. 
@@ -318,4 +319,11 @@ extern bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, unsigned long flags); #define arch_memremap_can_ram_remap arch_memremap_can_ram_remap +static inline bool arm64_is_protected_mmio(phys_addr_t phys_addr, size_t size) +{ + if (unlikely(is_realm_world())) + return __arm64_is_protected_mmio(phys_addr, size); + return false; +} + #endif /* __ASM_IO_H */ diff --git a/arch/arm64/include/asm/rsi.h b/arch/arm64/include/asm/rsi.h index acba065eb00e..188cbb9b23f5 100644 --- a/arch/arm64/include/asm/rsi.h +++ b/arch/arm64/include/asm/rsi.h @@ -14,6 +14,8 @@ DECLARE_STATIC_KEY_FALSE(rsi_present); void __init arm64_rsi_init(void); +bool __arm64_is_protected_mmio(phys_addr_t base, size_t size); + static inline bool is_realm_world(void) { return static_branch_unlikely(&rsi_present); diff --git a/arch/arm64/include/asm/rsi_cmds.h b/arch/arm64/include/asm/rsi_cmds.h index 2fcf351b5634..e6a211001bd3 100644 --- a/arch/arm64/include/asm/rsi_cmds.h +++ b/arch/arm64/include/asm/rsi_cmds.h @@ -45,6 +45,27 @@ static inline unsigned long rsi_get_realm_config(struct realm_config *cfg) return res.a0; } +static inline unsigned long rsi_ipa_state_get(phys_addr_t start, + phys_addr_t end, + enum ripas *state, + phys_addr_t *top) +{ + struct arm_smccc_res res; + + arm_smccc_smc(SMC_RSI_IPA_STATE_GET, + start, end, 0, 0, 0, 0, 0, + &res); + + if (res.a0 == RSI_SUCCESS) { + if (top) + *top = res.a1; + if (state) + *state = res.a2; + } + + return res.a0; +} + static inline long rsi_set_addr_range_state(phys_addr_t start, phys_addr_t end, enum ripas state, diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index cea8f0d39591..7e7934c4fca0 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -67,6 +67,32 @@ static void __init arm64_rsi_setup_memory(void) } } +bool __arm64_is_protected_mmio(phys_addr_t base, size_t size) +{ + enum ripas ripas; + phys_addr_t end, top; + + /* Overflow ? */ + if (WARN_ON(base + size <= base)) + return false; + + end = ALIGN(base + size, RSI_GRANULE_SIZE); + base = ALIGN_DOWN(base, RSI_GRANULE_SIZE); + + while (base < end) { + if (WARN_ON(rsi_ipa_state_get(base, end, &ripas, &top))) + break; + if (WARN_ON(top <= base)) + break; + if (ripas != RSI_RIPAS_DEV) + break; + base = top; + } + + return base >= end; +} +EXPORT_SYMBOL(__arm64_is_protected_mmio); + void __init arm64_rsi_init(void) { if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_SMC) From 3c6c706139564f74ec48229378873c1d930a8bc8 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 17 Oct 2024 14:14:28 +0100 Subject: [PATCH 094/168] arm64: rsi: Map unprotected MMIO as decrypted Instead of marking every MMIO as shared, check if the given region is "Protected" and apply the permissions accordingly. 
Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-6-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/rsi.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index 7e7934c4fca0..3e0c83e2296f 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -6,6 +6,8 @@ #include #include #include + +#include #include static struct realm_config config; @@ -93,6 +95,16 @@ bool __arm64_is_protected_mmio(phys_addr_t base, size_t size) } EXPORT_SYMBOL(__arm64_is_protected_mmio); +static int realm_ioremap_hook(phys_addr_t phys, size_t size, pgprot_t *prot) +{ + if (__arm64_is_protected_mmio(phys, size)) + *prot = pgprot_encrypted(*prot); + else + *prot = pgprot_decrypted(*prot); + + return 0; +} + void __init arm64_rsi_init(void) { if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_SMC) @@ -103,6 +115,9 @@ void __init arm64_rsi_init(void) return; prot_ns_shared = BIT(config.ipa_bits - 1); + if (arm64_ioremap_prot_hook_register(realm_ioremap_hook)) + return; + arm64_rsi_setup_memory(); static_branch_enable(&rsi_present); From 491db21d8256992ab9fe11c42744eb3044315d14 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 17 Oct 2024 14:14:29 +0100 Subject: [PATCH 095/168] efi: arm64: Map Device with Prot Shared Device mappings need to be emulated by the VMM so must be mapped shared with the host. Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-7-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/efi.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 712718aed5dd..1d25d8899dbf 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -34,8 +34,16 @@ static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) u64 attr = md->attribute; u32 type = md->type; - if (type == EFI_MEMORY_MAPPED_IO) - return PROT_DEVICE_nGnRE; + if (type == EFI_MEMORY_MAPPED_IO) { + pgprot_t prot = __pgprot(PROT_DEVICE_nGnRE); + + if (arm64_is_protected_mmio(md->phys_addr, + md->num_pages << EFI_PAGE_SHIFT)) + prot = pgprot_encrypted(prot); + else + prot = pgprot_decrypted(prot); + return pgprot_val(prot); + } if (region_is_misaligned(md)) { static bool __initdata code_is_misaligned; From fbf979a01375704fa87c559763209c658593b6f8 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Thu, 17 Oct 2024 14:14:30 +0100 Subject: [PATCH 096/168] arm64: Enforce bounce buffers for realm DMA Within a realm guest it's not possible for a device emulated by the VMM to access arbitrary guest memory. So force the use of bounce buffers to ensure that the memory the emulated devices are accessing is in memory which is explicitly shared with the host. This adds a call to swiotlb_update_mem_attributes() which calls set_memory_decrypted() to ensure the bounce buffer memory is shared with the host. For non-realm guests or hosts this is a no-op. 
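From a driver's point of view nothing changes: an ordinary streaming DMA mapping is transparently bounced through the shared (decrypted) swiotlb pool inside a realm, and is unaffected elsewhere. A minimal, hypothetical illustration ('dev', 'buf' and 'len' are placeholders):

	dma_addr_t dma = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, dma))
		return -ENOMEM;
	/* the emulated device accesses the bounce copy via 'dma' */
	dma_unmap_single(dev, dma, len, DMA_TO_DEVICE);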
Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Co-developed-by: Suzuki K Poulose Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-8-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/rsi.c | 1 + arch/arm64/mm/init.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index 3e0c83e2296f..a23c0a7154d2 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 27a32ff15412..d21f67d67cf5 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -366,8 +367,14 @@ void __init bootmem_init(void) */ void __init mem_init(void) { + unsigned int flags = SWIOTLB_VERBOSE; bool swiotlb = max_pfn > PFN_DOWN(arm64_dma_phys_limit); + if (is_realm_world()) { + swiotlb = true; + flags |= SWIOTLB_FORCE; + } + if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && !swiotlb) { /* * If no bouncing needed for ZONE_DMA, reduce the swiotlb @@ -379,7 +386,8 @@ void __init mem_init(void) swiotlb = true; } - swiotlb_init(swiotlb, SWIOTLB_VERBOSE); + swiotlb_init(swiotlb, flags); + swiotlb_update_mem_attributes(); /* this will put all unused low memory onto the freelists */ memblock_free_all(); From 0e9cb5995b2539a332fe65ada6a28a6be55f6e40 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Thu, 17 Oct 2024 14:14:31 +0100 Subject: [PATCH 097/168] arm64: mm: Avoid TLBI when marking pages as valid When __change_memory_common() is purely setting the valid bit on a PTE (e.g. via the set_memory_valid() call) there is no need for a TLBI as either the entry isn't changing (the valid bit was already set) or the entry was invalid and so should not have been cached in the TLB. Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Reviewed-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-9-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/mm/pageattr.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 0e270a1c51e6..547a9e0b46c2 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -60,7 +60,13 @@ static int __change_memory_common(unsigned long start, unsigned long size, ret = apply_to_page_range(&init_mm, start, size, change_page_range, &data); - flush_tlb_kernel_range(start, start + size); + /* + * If the memory is being made valid without changing any other bits + * then a TLBI isn't required as a non-valid entry cannot be cached in + * the TLB. + */ + if (pgprot_val(set_mask) != PTE_VALID || pgprot_val(clear_mask)) + flush_tlb_kernel_range(start, start + size); return ret; } From 42be24a4178fe51e6f47d91d8621b2f53820f88b Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 17 Oct 2024 14:14:32 +0100 Subject: [PATCH 098/168] arm64: Enable memory encrypt for Realms Use the memory encryption APIs to trigger a RSI call to request a transition between protected memory and shared memory (or vice versa) and updating the kernel's linear map of modified pages to flip the top bit of the IPA. This requires that block mappings are not used in the direct map for realm guests. 
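For reference, a hedged sketch of the generic interface this patch wires up: a caller that wants to let the host access a buffer transitions it to shared and later back to protected. 'page' and 'nr_pages' are placeholders; a failure means the affected pages must be leaked, which is why the implementation WARNs rather than trying to recover.

	unsigned long addr = (unsigned long)page_address(page);
	int ret;

	ret = set_memory_decrypted(addr, nr_pages);	/* protected -> shared */
	if (ret)
		return ret;
	/* ... host/VMM may now access the buffer ... */
	ret = set_memory_encrypted(addr, nr_pages);	/* shared -> protected */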
Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Signed-off-by: Suzuki K Poulose Co-developed-by: Steven Price Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-10-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 3 + arch/arm64/include/asm/mem_encrypt.h | 9 +++ arch/arm64/include/asm/pgtable.h | 5 ++ arch/arm64/include/asm/set_memory.h | 3 + arch/arm64/kernel/rsi.c | 16 +++++ arch/arm64/mm/pageattr.c | 90 +++++++++++++++++++++++++++- 6 files changed, 123 insertions(+), 3 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3e29b44d2d7b..ccea9c22d6df 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -21,6 +21,7 @@ config ARM64 select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_HAS_CACHE_LINE_SIZE + select ARCH_HAS_CC_PLATFORM select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE @@ -44,6 +45,8 @@ config ARM64 select ARCH_HAS_SETUP_DMA_OPS select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_SET_MEMORY + select ARCH_HAS_MEM_ENCRYPT + select ARCH_HAS_FORCE_DMA_UNENCRYPTED select ARCH_STACKWALK select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX diff --git a/arch/arm64/include/asm/mem_encrypt.h b/arch/arm64/include/asm/mem_encrypt.h index b0c9a86b13a4..f8f78f622dd2 100644 --- a/arch/arm64/include/asm/mem_encrypt.h +++ b/arch/arm64/include/asm/mem_encrypt.h @@ -2,6 +2,8 @@ #ifndef __ASM_MEM_ENCRYPT_H #define __ASM_MEM_ENCRYPT_H +#include + struct arm64_mem_crypt_ops { int (*encrypt)(unsigned long addr, int numpages); int (*decrypt)(unsigned long addr, int numpages); @@ -12,4 +14,11 @@ int arm64_mem_crypt_ops_register(const struct arm64_mem_crypt_ops *ops); int set_memory_encrypted(unsigned long addr, int numpages); int set_memory_decrypted(unsigned long addr, int numpages); +int realm_register_memory_enc_ops(void); + +static inline bool force_dma_unencrypted(struct device *dev) +{ + return is_realm_world(); +} + #endif /* __ASM_MEM_ENCRYPT_H */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index c329ea061dc9..7e4bdc8259a2 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -684,6 +684,11 @@ static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, #define pgprot_nx(prot) \ __pgprot_modify(prot, PTE_MAYBE_GP, PTE_PXN) +#define pgprot_decrypted(prot) \ + __pgprot_modify(prot, PROT_NS_SHARED, PROT_NS_SHARED) +#define pgprot_encrypted(prot) \ + __pgprot_modify(prot, PROT_NS_SHARED, 0) + /* * Mark the prot value as uncacheable and unbufferable. 
*/ diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h index 917761feeffd..37774c793006 100644 --- a/arch/arm64/include/asm/set_memory.h +++ b/arch/arm64/include/asm/set_memory.h @@ -15,4 +15,7 @@ int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); bool kernel_page_present(struct page *page); +int set_memory_encrypted(unsigned long addr, int numpages); +int set_memory_decrypted(unsigned long addr, int numpages); + #endif /* _ASM_ARM64_SET_MEMORY_H */ diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index a23c0a7154d2..3031f25c32ef 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -7,8 +7,10 @@ #include #include #include +#include #include +#include #include static struct realm_config config; @@ -19,6 +21,17 @@ EXPORT_SYMBOL(prot_ns_shared); DEFINE_STATIC_KEY_FALSE_RO(rsi_present); EXPORT_SYMBOL(rsi_present); +bool cc_platform_has(enum cc_attr attr) +{ + switch (attr) { + case CC_ATTR_MEM_ENCRYPT: + return is_realm_world(); + default: + return false; + } +} +EXPORT_SYMBOL_GPL(cc_platform_has); + static bool rsi_version_matches(void) { unsigned long ver_lower, ver_higher; @@ -119,6 +132,9 @@ void __init arm64_rsi_init(void) if (arm64_ioremap_prot_hook_register(realm_ioremap_hook)) return; + if (realm_register_memory_enc_ops()) + return; + arm64_rsi_setup_memory(); static_branch_enable(&rsi_present); diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 547a9e0b46c2..6ae6ae806454 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -5,10 +5,12 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -23,14 +25,16 @@ bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED bool can_set_direct_map(void) { /* - * rodata_full and DEBUG_PAGEALLOC require linear map to be - * mapped at page granularity, so that it is possible to + * rodata_full, DEBUG_PAGEALLOC and a Realm guest all require linear + * map to be mapped at page granularity, so that it is possible to * protect/unprotect single pages. * * KFENCE pool requires page-granular mapping if initialized late. + * + * Realms need to make pages shared/protected at page granularity. 
*/ return rodata_full || debug_pagealloc_enabled() || - arm64_kfence_can_set_direct_map(); + arm64_kfence_can_set_direct_map() || is_realm_world(); } static int change_page_range(pte_t *ptep, unsigned long addr, void *data) @@ -198,6 +202,86 @@ int set_direct_map_default_noflush(struct page *page) PAGE_SIZE, change_page_range, &data); } +static int __set_memory_enc_dec(unsigned long addr, + int numpages, + bool encrypt) +{ + unsigned long set_prot = 0, clear_prot = 0; + phys_addr_t start, end; + int ret; + + if (!is_realm_world()) + return 0; + + if (!__is_lm_address(addr)) + return -EINVAL; + + start = __virt_to_phys(addr); + end = start + numpages * PAGE_SIZE; + + if (encrypt) + clear_prot = PROT_NS_SHARED; + else + set_prot = PROT_NS_SHARED; + + /* + * Break the mapping before we make any changes to avoid stale TLB + * entries or Synchronous External Aborts caused by RIPAS_EMPTY + */ + ret = __change_memory_common(addr, PAGE_SIZE * numpages, + __pgprot(set_prot), + __pgprot(clear_prot | PTE_VALID)); + + if (ret) + return ret; + + if (encrypt) + ret = rsi_set_memory_range_protected(start, end); + else + ret = rsi_set_memory_range_shared(start, end); + + if (ret) + return ret; + + return __change_memory_common(addr, PAGE_SIZE * numpages, + __pgprot(PTE_VALID), + __pgprot(0)); +} + +static int realm_set_memory_encrypted(unsigned long addr, int numpages) +{ + int ret = __set_memory_enc_dec(addr, numpages, true); + + /* + * If the request to change state fails, then the only sensible cause + * of action for the caller is to leak the memory + */ + WARN(ret, "Failed to encrypt memory, %d pages will be leaked", + numpages); + + return ret; +} + +static int realm_set_memory_decrypted(unsigned long addr, int numpages) +{ + int ret = __set_memory_enc_dec(addr, numpages, false); + + WARN(ret, "Failed to decrypt memory, %d pages will be leaked", + numpages); + + return ret; +} + +static const struct arm64_mem_crypt_ops realm_crypt_ops = { + .encrypt = realm_set_memory_encrypted, + .decrypt = realm_set_memory_decrypted, +}; + +int realm_register_memory_enc_ops(void) +{ + return arm64_mem_crypt_ops_register(&realm_crypt_ops); +} + #ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { From 7999edc484ca376f803562edb2d43ec921642c2a Mon Sep 17 00:00:00 2001 From: Sami Mujawar Date: Thu, 17 Oct 2024 14:14:33 +0100 Subject: [PATCH 099/168] virt: arm-cca-guest: TSM_REPORT support for realms Introduce an arm-cca-guest driver that registers with the configfs-tsm module to provide user interfaces for retrieving an attestation token. When a new report is requested the arm-cca-guest driver invokes the appropriate RSI interfaces to query an attestation token. The steps to retrieve an attestation token are as follows: 1. Mount the configfs filesystem if not already mounted mount -t configfs none /sys/kernel/config 2. 
Generate an attestation token report=/sys/kernel/config/tsm/report/report0 mkdir $report dd if=/dev/urandom bs=64 count=1 > $report/inblob hexdump -C $report/outblob rmdir $report Signed-off-by: Sami Mujawar Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/20241017131434.40935-11-steven.price@arm.com Signed-off-by: Catalin Marinas --- drivers/virt/coco/Kconfig | 2 + drivers/virt/coco/Makefile | 1 + drivers/virt/coco/arm-cca-guest/Kconfig | 11 + drivers/virt/coco/arm-cca-guest/Makefile | 2 + .../virt/coco/arm-cca-guest/arm-cca-guest.c | 224 ++++++++++++++++++ 5 files changed, 240 insertions(+) create mode 100644 drivers/virt/coco/arm-cca-guest/Kconfig create mode 100644 drivers/virt/coco/arm-cca-guest/Makefile create mode 100644 drivers/virt/coco/arm-cca-guest/arm-cca-guest.c diff --git a/drivers/virt/coco/Kconfig b/drivers/virt/coco/Kconfig index d9ff676bf48d..ff869d883d95 100644 --- a/drivers/virt/coco/Kconfig +++ b/drivers/virt/coco/Kconfig @@ -14,3 +14,5 @@ source "drivers/virt/coco/pkvm-guest/Kconfig" source "drivers/virt/coco/sev-guest/Kconfig" source "drivers/virt/coco/tdx-guest/Kconfig" + +source "drivers/virt/coco/arm-cca-guest/Kconfig" diff --git a/drivers/virt/coco/Makefile b/drivers/virt/coco/Makefile index b69c30c1c720..c3d07cfc087e 100644 --- a/drivers/virt/coco/Makefile +++ b/drivers/virt/coco/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_EFI_SECRET) += efi_secret/ obj-$(CONFIG_ARM_PKVM_GUEST) += pkvm-guest/ obj-$(CONFIG_SEV_GUEST) += sev-guest/ obj-$(CONFIG_INTEL_TDX_GUEST) += tdx-guest/ +obj-$(CONFIG_ARM_CCA_GUEST) += arm-cca-guest/ diff --git a/drivers/virt/coco/arm-cca-guest/Kconfig b/drivers/virt/coco/arm-cca-guest/Kconfig new file mode 100644 index 000000000000..9dd27c3ee215 --- /dev/null +++ b/drivers/virt/coco/arm-cca-guest/Kconfig @@ -0,0 +1,11 @@ +config ARM_CCA_GUEST + tristate "Arm CCA Guest driver" + depends on ARM64 + default m + select TSM_REPORTS + help + The driver provides userspace interface to request and + attestation report from the Realm Management Monitor(RMM). + + If you choose 'M' here, this module will be called + arm-cca-guest. diff --git a/drivers/virt/coco/arm-cca-guest/Makefile b/drivers/virt/coco/arm-cca-guest/Makefile new file mode 100644 index 000000000000..69eeba08e98a --- /dev/null +++ b/drivers/virt/coco/arm-cca-guest/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_ARM_CCA_GUEST) += arm-cca-guest.o diff --git a/drivers/virt/coco/arm-cca-guest/arm-cca-guest.c b/drivers/virt/coco/arm-cca-guest/arm-cca-guest.c new file mode 100644 index 000000000000..488153879ec9 --- /dev/null +++ b/drivers/virt/coco/arm-cca-guest/arm-cca-guest.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +/** + * struct arm_cca_token_info - a descriptor for the token buffer. 
+ * @challenge: Pointer to the challenge data + * @challenge_size: Size of the challenge data + * @granule: PA of the granule to which the token will be written + * @offset: Offset within granule to start of buffer in bytes + * @result: result of rsi_attestation_token_continue operation + */ +struct arm_cca_token_info { + void *challenge; + unsigned long challenge_size; + phys_addr_t granule; + unsigned long offset; + unsigned long result; +}; + +static void arm_cca_attestation_init(void *param) +{ + struct arm_cca_token_info *info; + + info = (struct arm_cca_token_info *)param; + + info->result = rsi_attestation_token_init(info->challenge, + info->challenge_size); +} + +/** + * arm_cca_attestation_continue - Retrieve the attestation token data. + * + * @param: pointer to the arm_cca_token_info + * + * Attestation token generation is a long running operation and therefore + * the token data may not be retrieved in a single call. Moreover, the + * token retrieval operation must be requested on the same CPU on which the + * attestation token generation was initialised. + * This helper function is therefore scheduled on the same CPU multiple + * times until the entire token data is retrieved. + */ +static void arm_cca_attestation_continue(void *param) +{ + unsigned long len; + unsigned long size; + struct arm_cca_token_info *info; + + info = (struct arm_cca_token_info *)param; + + size = RSI_GRANULE_SIZE - info->offset; + info->result = rsi_attestation_token_continue(info->granule, + info->offset, size, &len); + info->offset += len; +} + +/** + * arm_cca_report_new - Generate a new attestation token. + * + * @report: pointer to the TSM report context information. + * @data: pointer to the context specific data for this module. + * + * Initialise the attestation token generation using the challenge data + * passed in the TSM descriptor. Allocate memory for the attestation token + * and schedule calls to retrieve the attestation token on the same CPU + * on which the attestation token generation was initialised. + * + * The challenge data must be at least 32 bytes and no more than 64 bytes. If + * less than 64 bytes are provided it will be zero padded to 64 bytes. + * + * Return: + * * %0 - Attestation token generated successfully. + * * %-EINVAL - A parameter was not valid. + * * %-ENOMEM - Out of memory. + * * %-EFAULT - Failed to get IPA for memory page(s). + * * A negative status code as returned by smp_call_function_single(). + */ +static int arm_cca_report_new(struct tsm_report *report, void *data) +{ + int ret; + int cpu; + long max_size; + unsigned long token_size = 0; + struct arm_cca_token_info info; + void *buf; + u8 *token __free(kvfree) = NULL; + struct tsm_desc *desc = &report->desc; + + if (desc->inblob_len < 32 || desc->inblob_len > 64) + return -EINVAL; + + /* + * The attestation token 'init' and 'continue' calls must be + * performed on the same CPU. smp_call_function_single() is used + * instead of simply calling get_cpu() because of the need to + * allocate outblob based on the returned value from the 'init' + * call and that cannot be done in an atomic context. 
+ */ + cpu = smp_processor_id(); + + info.challenge = desc->inblob; + info.challenge_size = desc->inblob_len; + + ret = smp_call_function_single(cpu, arm_cca_attestation_init, + &info, true); + if (ret) + return ret; + max_size = info.result; + + if (max_size <= 0) + return -EINVAL; + + /* Allocate outblob */ + token = kvzalloc(max_size, GFP_KERNEL); + if (!token) + return -ENOMEM; + + /* + * Since the outblob may not be physically contiguous, use a page + * to bounce the buffer from RMM. + */ + buf = alloc_pages_exact(RSI_GRANULE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* Get the PA of the memory page(s) that were allocated */ + info.granule = (unsigned long)virt_to_phys(buf); + + /* Loop until the token is ready or there is an error */ + do { + /* Retrieve one RSI_GRANULE_SIZE data per loop iteration */ + info.offset = 0; + do { + /* + * Schedule a call to retrieve a sub-granule chunk + * of data per loop iteration. + */ + ret = smp_call_function_single(cpu, + arm_cca_attestation_continue, + (void *)&info, true); + if (ret != 0) { + token_size = 0; + goto exit_free_granule_page; + } + } while (info.result == RSI_INCOMPLETE && + info.offset < RSI_GRANULE_SIZE); + + if (info.result != RSI_SUCCESS) { + ret = -ENXIO; + token_size = 0; + goto exit_free_granule_page; + } + + /* + * Copy the retrieved token data from the granule + * to the token buffer, ensuring that the RMM doesn't + * overflow the buffer. + */ + if (WARN_ON(token_size + info.offset > max_size)) + break; + memcpy(&token[token_size], buf, info.offset); + token_size += info.offset; + } while (info.result == RSI_INCOMPLETE); + + report->outblob = no_free_ptr(token); +exit_free_granule_page: + report->outblob_len = token_size; + free_pages_exact(buf, RSI_GRANULE_SIZE); + return ret; +} + +static const struct tsm_ops arm_cca_tsm_ops = { + .name = KBUILD_MODNAME, + .report_new = arm_cca_report_new, +}; + +/** + * arm_cca_guest_init - Register with the Trusted Security Module (TSM) + * interface. + * + * Return: + * * %0 - Registered successfully with the TSM interface. + * * %-ENODEV - The execution context is not an Arm Realm. + * * %-EBUSY - Already registered. + */ +static int __init arm_cca_guest_init(void) +{ + int ret; + + if (!is_realm_world()) + return -ENODEV; + + ret = tsm_register(&arm_cca_tsm_ops, NULL); + if (ret < 0) + pr_err("Error %d registering with TSM\n", ret); + + return ret; +} +module_init(arm_cca_guest_init); + +/** + * arm_cca_guest_exit - unregister with the Trusted Security Module (TSM) + * interface. + */ +static void __exit arm_cca_guest_exit(void) +{ + tsm_unregister(&arm_cca_tsm_ops); +} +module_exit(arm_cca_guest_exit); + +MODULE_AUTHOR("Sami Mujawar "); +MODULE_DESCRIPTION("Arm CCA Guest TSM Driver"); +MODULE_LICENSE("GPL"); From 972d755f01954bd0e36d8696f0d7dc6466072c21 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Thu, 17 Oct 2024 14:14:34 +0100 Subject: [PATCH 100/168] arm64: Document Arm Confidential Compute Add some documentation on Arm CCA and the requirements for running Linux as a Realm guest. Also update booting.rst to describe the requirement for RIPAS RAM. 
Reviewed-by: Gavin Shan Reviewed-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-12-steven.price@arm.com Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/arm-cca.rst | 69 ++++++++++++++++++++++++++++ Documentation/arch/arm64/booting.rst | 3 ++ Documentation/arch/arm64/index.rst | 1 + 3 files changed, 73 insertions(+) create mode 100644 Documentation/arch/arm64/arm-cca.rst diff --git a/Documentation/arch/arm64/arm-cca.rst b/Documentation/arch/arm64/arm-cca.rst new file mode 100644 index 000000000000..c48b7d4ab6bd --- /dev/null +++ b/Documentation/arch/arm64/arm-cca.rst @@ -0,0 +1,69 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================================== +Arm Confidential Compute Architecture +===================================== + +Arm systems that support the Realm Management Extension (RME) contain +hardware to allow a VM guest to be run in a way which protects the code +and data of the guest from the hypervisor. It extends the older "two +world" model (Normal and Secure World) into four worlds: Normal, Secure, +Root and Realm. Linux can then also be run as a guest to a monitor +running in the Realm world. + +The monitor running in the Realm world is known as the Realm Management +Monitor (RMM) and implements the Realm Management Monitor +specification[1]. The monitor acts a bit like a hypervisor (e.g. it runs +in EL2 and manages the stage 2 page tables etc of the guests running in +Realm world), however much of the control is handled by a hypervisor +running in the Normal World. The Normal World hypervisor uses the Realm +Management Interface (RMI) defined by the RMM specification to request +the RMM to perform operations (e.g. mapping memory or executing a vCPU). + +The RMM defines an environment for guests where the address space (IPA) +is split into two. The lower half is protected - any memory that is +mapped in this half cannot be seen by the Normal World and the RMM +restricts what operations the Normal World can perform on this memory +(e.g. the Normal World cannot replace pages in this region without the +guest's cooperation). The upper half is shared, the Normal World is free +to make changes to the pages in this region, and is able to emulate MMIO +devices in this region too. + +A guest running in a Realm may also communicate with the RMM using the +Realm Services Interface (RSI) to request changes in its environment or +to perform attestation about its environment. In particular it may +request that areas of the protected address space are transitioned +between 'RAM' and 'EMPTY' (in either direction). This allows a Realm +guest to give up memory to be returned to the Normal World, or to +request new memory from the Normal World. Without an explicit request +from the Realm guest the RMM will otherwise prevent the Normal World +from making these changes. + +Linux as a Realm Guest +---------------------- + +To run Linux as a guest within a Realm, the following must be provided +either by the VMM or by a `boot loader` run in the Realm before Linux: + + * All protected RAM described to Linux (by DT or ACPI) must be marked + RIPAS RAM before handing control over to Linux. + + * MMIO devices must be either unprotected (e.g. emulated by the Normal + World) or marked RIPAS DEV. + + * MMIO devices emulated by the Normal World and used very early in boot + (specifically earlycon) must be specified in the upper half of IPA. + For earlycon this can be done by specifying the address on the + command line, e.g. 
with an IPA size of 33 bits and the base address + of the emulated UART at 0x1000000: ``earlycon=uart,mmio,0x101000000`` + + * Linux will use bounce buffers for communicating with unprotected + devices. It will transition some protected memory to RIPAS EMPTY and + expect to be able to access unprotected pages at the same IPA address + but with the highest valid IPA bit set. The expectation is that the + VMM will remove the physical pages from the protected mapping and + provide those pages as unprotected pages. + +References +---------- +[1] https://developer.arm.com/documentation/den0137/ diff --git a/Documentation/arch/arm64/booting.rst b/Documentation/arch/arm64/booting.rst index b57776a68f15..30164fb24a24 100644 --- a/Documentation/arch/arm64/booting.rst +++ b/Documentation/arch/arm64/booting.rst @@ -41,6 +41,9 @@ to automatically locate and size all RAM, or it may use knowledge of the RAM in the machine, or any other method the boot loader designer sees fit.) +For Arm Confidential Compute Realms this includes ensuring that all +protected RAM has a Realm IPA state (RIPAS) of "RAM". + 2. Setup the device tree ------------------------- diff --git a/Documentation/arch/arm64/index.rst b/Documentation/arch/arm64/index.rst index 78544de0a8a9..12c243c3af20 100644 --- a/Documentation/arch/arm64/index.rst +++ b/Documentation/arch/arm64/index.rst @@ -10,6 +10,7 @@ ARM64 Architecture acpi_object_usage amu arm-acpi + arm-cca asymmetric-32bit booting cpu-feature-registers From 358dd4a9bdac63a0a8fb13773bfce6f599e25433 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 21 Oct 2024 19:14:34 +0100 Subject: [PATCH 101/168] arm64: Add command-line override for ID_AA64MMFR0_EL1.ECV It appears that relatively popular hardware out there implements the CNTPOFF_EL2 variant of FEAT_ECV, advertises it via ID_AA64MMFR0_EL1, but cannot be bothered to set SCR_EL3.ECVEn to 1. You would probably think that "this is fine, EL3 will take the trap on access to CNTPOFF_EL2 and flip the ECVEn bit", as that's what a semi-decent firmware implementation would do. But no. None of that. This particular implementation takes the trap, considers its purpose in life, decides that it has none, and *RESETS* the system. Yes, x1e001de, I'm talking about you. In order to allow this machine to be promoted slightly above the level of a glorified door-stop, add a new "id_aa64mmfr0.ecv" override. allowing the kernel to pretend this option was never there. 
Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241021181434.1052974-1-maz@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/pi/idreg-override.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index 29d4b6244a6f..fbc37b2733e2 100644 --- a/arch/arm64/kernel/pi/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -38,6 +38,15 @@ struct ftr_set_desc { #define FIELD(n, s, f) { .name = n, .shift = s, .width = 4, .filter = f } +static const struct ftr_set_desc mmfr0 __prel64_initconst = { + .name = "id_aa64mmfr0", + .override = &id_aa64mmfr0_override, + .fields = { + FIELD("ecv", ID_AA64MMFR0_EL1_ECV_SHIFT, NULL), + {} + }, +}; + static bool __init mmfr1_vh_filter(u64 val) { /* @@ -196,6 +205,7 @@ static const struct ftr_set_desc sw_features __prel64_initconst = { static const PREL64(const struct ftr_set_desc, reg) regs[] __prel64_initconst = { + { &mmfr0 }, { &mmfr1 }, { &mmfr2 }, { &pfr0 }, From dca93d29845dfed60910ba13dbfb6ae6a0e19f6d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 23 Oct 2024 00:20:45 +0100 Subject: [PATCH 102/168] kselftest/arm64: Log fp-stress child startup errors to stdout Currently if we encounter an error between fork() and exec() of a child process we log the error to stderr. This means that the errors don't get annotated with the child information which makes diagnostics harder and means that if we miss the exit signal from the child we can deadlock waiting for output from the child. Improve robustness and output quality by logging to stdout instead. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241023-arm64-fp-stress-exec-fail-v1-1-ee3c62932c15@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/fp-stress.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c index e62c9dbad501..13958e645afc 100644 --- a/tools/testing/selftests/arm64/fp/fp-stress.c +++ b/tools/testing/selftests/arm64/fp/fp-stress.c @@ -79,7 +79,7 @@ static void child_start(struct child_data *child, const char *program) */ ret = dup2(pipefd[1], 1); if (ret == -1) { - fprintf(stderr, "dup2() %d\n", errno); + printf("dup2() %d\n", errno); exit(EXIT_FAILURE); } @@ -89,7 +89,7 @@ static void child_start(struct child_data *child, const char *program) */ ret = dup2(startup_pipe[0], 3); if (ret == -1) { - fprintf(stderr, "dup2() %d\n", errno); + printf("dup2() %d\n", errno); exit(EXIT_FAILURE); } @@ -107,16 +107,15 @@ static void child_start(struct child_data *child, const char *program) */ ret = read(3, &i, sizeof(i)); if (ret < 0) - fprintf(stderr, "read(startp pipe) failed: %s (%d)\n", - strerror(errno), errno); + printf("read(startp pipe) failed: %s (%d)\n", + strerror(errno), errno); if (ret > 0) - fprintf(stderr, "%d bytes of data on startup pipe\n", - ret); + printf("%d bytes of data on startup pipe\n", ret); close(3); ret = execl(program, program, NULL); - fprintf(stderr, "execl(%s) failed: %d (%s)\n", - program, errno, strerror(errno)); + printf("execl(%s) failed: %d (%s)\n", + program, errno, strerror(errno)); exit(EXIT_FAILURE); } else { From 0448a96e243d7ae0e2db3a633d0f2307be3aa8b7 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 21 Oct 2024 12:07:13 +0530 Subject: [PATCH 103/168] arm64/mm: Drop _PROT_SECT_DEFAULT 'commit db95ea787bd1 ("arm64: mm: Wire up TCR.DS bit to PTE shareability 
fields")' dropped the last reference to symbol _PROT_SECT_DEFAULT, while transitioning from PMD_SECT_S to PMD_MAYBE_SHARED for PROT_SECT_DEFAULT. Hence let's just drop that symbol which is now unused. Cc: Will Deacon Cc: Ryan Roberts Cc: Ard Biesheuvel Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20241021063713.750870-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable-prot.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 2a11d0c10760..84a099989f82 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -35,7 +35,6 @@ #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ #define _PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) -#define _PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) #define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_MAYBE_NG | PTE_MAYBE_SHARED | PTE_AF) #define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_MAYBE_NG | PMD_MAYBE_SHARED | PMD_SECT_AF) From 0263a1e4f5ddbdac5ad84e84e5ac75dcb20121a1 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Tue, 24 Sep 2024 14:12:49 +0800 Subject: [PATCH 104/168] dt-bindings: perf: fsl-imx-ddr: Add i.MX91 compatible i.MX91 has a DDR Performance Monitor Unit which is compatible with i.MX93. This will add a compatible for i.MX91. Signed-off-by: Xu Yang Reviewed-by: Frank Li Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20240924061251.3387850-1-xu.yang_2@nxp.com Signed-off-by: Will Deacon --- Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml index 37e8b98f2cdc..8597ea625edb 100644 --- a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml +++ b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml @@ -31,7 +31,9 @@ properties: - const: fsl,imx8dxl-ddr-pmu - const: fsl,imx8-ddr-pmu - items: - - const: fsl,imx95-ddr-pmu + - enum: + - fsl,imx91-ddr-pmu + - fsl,imx95-ddr-pmu - const: fsl,imx93-ddr-pmu reg: From 44798fe136dc5adc9733a5c3d1fddafd6ec942ff Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Tue, 24 Sep 2024 14:12:50 +0800 Subject: [PATCH 105/168] perf: imx_perf: add support for i.MX91 platform This will add compatible and identifier for i.MX91 platform. 
Signed-off-by: Xu Yang Reviewed-by: Frank Li Link: https://lore.kernel.org/r/20240924061251.3387850-2-xu.yang_2@nxp.com Signed-off-by: Will Deacon --- drivers/perf/fsl_imx9_ddr_perf.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/perf/fsl_imx9_ddr_perf.c b/drivers/perf/fsl_imx9_ddr_perf.c index 69f920b1caf2..fe1a51f64751 100644 --- a/drivers/perf/fsl_imx9_ddr_perf.c +++ b/drivers/perf/fsl_imx9_ddr_perf.c @@ -81,6 +81,10 @@ struct ddr_pmu { int id; }; +static const struct imx_ddr_devtype_data imx91_devtype_data = { + .identifier = "imx91", +}; + static const struct imx_ddr_devtype_data imx93_devtype_data = { .identifier = "imx93", }; @@ -100,6 +104,7 @@ static inline bool is_imx95(struct ddr_pmu *pmu) } static const struct of_device_id imx_ddr_pmu_dt_ids[] = { + { .compatible = "fsl,imx91-ddr-pmu", .data = &imx91_devtype_data }, { .compatible = "fsl,imx93-ddr-pmu", .data = &imx93_devtype_data }, { .compatible = "fsl,imx95-ddr-pmu", .data = &imx95_devtype_data }, { /* sentinel */ } From 48545b3eff6b061ed616d3ed67ca215abfb1000d Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 9 Oct 2024 19:52:08 -0700 Subject: [PATCH 106/168] perf/cxlpmu: Support missing events in 3.1 spec Update the CXL PMU driver to support the new events introduced in the latest revision. These are: - read/write accesses with TEE constraints. - S2M indicating Modified state. Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Signed-off-by: Davidlohr Bueso Link: https://lore.kernel.org/r/20241010025208.180458-1-dave@stgolabs.net Signed-off-by: Will Deacon --- drivers/perf/cxl_pmu.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/perf/cxl_pmu.c b/drivers/perf/cxl_pmu.c index 43d68b69e630..bee4b5b52ec6 100644 --- a/drivers/perf/cxl_pmu.c +++ b/drivers/perf/cxl_pmu.c @@ -354,7 +354,7 @@ static struct attribute *cxl_pmu_event_attrs[] = { CXL_PMU_EVENT_CXL_ATTR(d2h_req_wowrinvf, CXL_PMU_GID_D2H_REQ, BIT(13)), CXL_PMU_EVENT_CXL_ATTR(d2h_req_wrinv, CXL_PMU_GID_D2H_REQ, BIT(14)), CXL_PMU_EVENT_CXL_ATTR(d2h_req_cacheflushed, CXL_PMU_GID_D2H_REQ, BIT(16)), - /* CXL rev 3.0 Table 3-20 - D2H Repsonse Encodings */ + /* CXL rev 3.0 Table 3-20 - D2H Response Encodings */ CXL_PMU_EVENT_CXL_ATTR(d2h_rsp_rspihiti, CXL_PMU_GID_D2H_RSP, BIT(4)), CXL_PMU_EVENT_CXL_ATTR(d2h_rsp_rspvhitv, CXL_PMU_GID_D2H_RSP, BIT(6)), CXL_PMU_EVENT_CXL_ATTR(d2h_rsp_rspihitse, CXL_PMU_GID_D2H_RSP, BIT(5)), @@ -377,12 +377,14 @@ static struct attribute *cxl_pmu_event_attrs[] = { /* CXL rev 3.0 Table 13-5 directly lists these */ CXL_PMU_EVENT_CXL_ATTR(cachedata_d2h_data, CXL_PMU_GID_CACHE_DATA, BIT(0)), CXL_PMU_EVENT_CXL_ATTR(cachedata_h2d_data, CXL_PMU_GID_CACHE_DATA, BIT(1)), - /* CXL rev 3.0 Table 3-29 M2S Req Memory Opcodes */ + /* CXL rev 3.1 Table 3-35 M2S Req Memory Opcodes */ CXL_PMU_EVENT_CXL_ATTR(m2s_req_meminv, CXL_PMU_GID_M2S_REQ, BIT(0)), CXL_PMU_EVENT_CXL_ATTR(m2s_req_memrd, CXL_PMU_GID_M2S_REQ, BIT(1)), CXL_PMU_EVENT_CXL_ATTR(m2s_req_memrddata, CXL_PMU_GID_M2S_REQ, BIT(2)), CXL_PMU_EVENT_CXL_ATTR(m2s_req_memrdfwd, CXL_PMU_GID_M2S_REQ, BIT(3)), CXL_PMU_EVENT_CXL_ATTR(m2s_req_memwrfwd, CXL_PMU_GID_M2S_REQ, BIT(4)), + CXL_PMU_EVENT_CXL_ATTR(m2s_req_memrdtee, CXL_PMU_GID_M2S_REQ, BIT(5)), + CXL_PMU_EVENT_CXL_ATTR(m2s_req_memrddatatee, CXL_PMU_GID_M2S_REQ, BIT(6)), CXL_PMU_EVENT_CXL_ATTR(m2s_req_memspecrd, CXL_PMU_GID_M2S_REQ, BIT(8)), CXL_PMU_EVENT_CXL_ATTR(m2s_req_meminvnt, CXL_PMU_GID_M2S_REQ, BIT(9)), CXL_PMU_EVENT_CXL_ATTR(m2s_req_memcleanevict, CXL_PMU_GID_M2S_REQ, BIT(10)), @@ 
-404,10 +406,11 @@ static struct attribute *cxl_pmu_event_attrs[] = { CXL_PMU_EVENT_CXL_ATTR(s2m_bisnp_curblk, CXL_PMU_GID_S2M_BISNP, BIT(4)), CXL_PMU_EVENT_CXL_ATTR(s2m_bisnp_datblk, CXL_PMU_GID_S2M_BISNP, BIT(5)), CXL_PMU_EVENT_CXL_ATTR(s2m_bisnp_invblk, CXL_PMU_GID_S2M_BISNP, BIT(6)), - /* CXL rev 3.0 Table 3-43 S2M NDR Opcopdes */ + /* CXL rev 3.1 Table 3-50 S2M NDR Opcopdes */ CXL_PMU_EVENT_CXL_ATTR(s2m_ndr_cmp, CXL_PMU_GID_S2M_NDR, BIT(0)), CXL_PMU_EVENT_CXL_ATTR(s2m_ndr_cmps, CXL_PMU_GID_S2M_NDR, BIT(1)), CXL_PMU_EVENT_CXL_ATTR(s2m_ndr_cmpe, CXL_PMU_GID_S2M_NDR, BIT(2)), + CXL_PMU_EVENT_CXL_ATTR(s2m_ndr_cmpm, CXL_PMU_GID_S2M_NDR, BIT(3)), CXL_PMU_EVENT_CXL_ATTR(s2m_ndr_biconflictack, CXL_PMU_GID_S2M_NDR, BIT(4)), /* CXL rev 3.0 Table 3-46 S2M DRS opcodes */ CXL_PMU_EVENT_CXL_ATTR(s2m_drs_memdata, CXL_PMU_GID_S2M_DRS, BIT(0)), From 759b5fc6cc3e3c5841f6a3e4638b39534b0fc716 Mon Sep 17 00:00:00 2001 From: Ilkka Koskinen Date: Wed, 16 Oct 2024 14:01:36 -0700 Subject: [PATCH 107/168] perf/dwc_pcie: Convert the events with mixed case to lowercase Group #1 events had both upper and lower case characters in their names. Trying to count such events with perf tool results in an error: $ perf stat -e dwc_rootport_10008/Tx_PCIe_TLP_Data_Payload/ sleep 1 event syntax error: 'dwc_rootport_10008/Tx_PCIe_TLP_Data_Payload/' \___ Bad event or PMU Unable to find PMU or event on a PMU of 'dwc_rootport_10008' event syntax error: '..port_10008/Tx_PCIe_TLP_Data_Payload/' \___ unknown term 'Tx_PCIe_TLP_Data_Payload' for pmu 'dwc_rootport_10008' valid terms: eventid,type,lane,config,config1,config2,config3,name,period,percore,metric-id Run 'perf list' for a list of valid events Usage: perf stat [] [] -e, --event event selector. use 'perf list' to list available events Perf tool assumes the event names are either in lower or upper case. This is also mentioned in Documentation/ABI/testing/sysfs-bus-event_source-devices-events "As performance monitoring event names are case insensitive in the perf tool, the perf tool only looks for lower or upper case event names in sysfs to avoid scanning the directory. It is therefore required the name of the event here is either lower or upper case." Change the Group #1 events names to lower case. 
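With the rename in place the same counter can be requested using the
lowercase spelling, for example (PMU instance name as in the failing
command above):

	$ perf stat -e dwc_rootport_10008/tx_pcie_tlp_data_payload/ sleep 1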
Signed-off-by: Ilkka Koskinen Link: https://lore.kernel.org/r/20241016210136.65452-1-ilkka@os.amperecomputing.com Signed-off-by: Will Deacon --- drivers/perf/dwc_pcie_pmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c index 59526a48499f..126d2c3516ad 100644 --- a/drivers/perf/dwc_pcie_pmu.c +++ b/drivers/perf/dwc_pcie_pmu.c @@ -202,10 +202,10 @@ static struct attribute *dwc_pcie_pmu_time_event_attrs[] = { DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1_AUX, 0x09), /* Group #1 */ - DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Tx_PCIe_TLP_Data_Payload, 0x20), - DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Rx_PCIe_TLP_Data_Payload, 0x21), - DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Tx_CCIX_TLP_Data_Payload, 0x22), - DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Rx_CCIX_TLP_Data_Payload, 0x23), + DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(tx_pcie_tlp_data_payload, 0x20), + DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(rx_pcie_tlp_data_payload, 0x21), + DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(tx_ccix_tlp_data_payload, 0x22), + DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(rx_ccix_tlp_data_payload, 0x23), /* * Leave it to the user to specify the lane ID to avoid generating From 0bbff9ed81654d5f06bfca484681756ee407f924 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 2 Oct 2024 13:43:24 -0500 Subject: [PATCH 108/168] perf/arm_pmuv3: Add PMUv3.9 per counter EL0 access control Armv8.9/9.4 PMUv3.9 adds per counter EL0 access controls. Per counter access is enabled with the UEN bit in PMUSERENR_EL1 register. Individual counters are enabled/disabled in the PMUACR_EL1 register. When UEN is set, the CR/ER bits control EL0 write access and must be set to disable write access. With the access controls, the clearing of unused counters can be skipped. KVM also configures PMUSERENR_EL1 in order to trap to EL2. UEN does not need to be set for it since only PMUv3.5 is exposed to guests. 
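As an illustrative sketch only (mirroring the driver hunk below and the
PMUACR_EL1 field layout added to the sysreg file, not extra code for
this patch), the new programming model boils down to:

	/* Fine-grained EL0 access; ER/CR set => EL0 writes stay disabled */
	write_pmuserenr(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR |
			ARMV8_PMU_USERENR_UEN);

	/* PMUACR_EL1: P[30:0] = event counters, C (bit 31) = cycle counter */
	write_pmuacr(BIT(0) | BIT(31));	/* user reads of counter 0 + cycles */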
Signed-off-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20241002184326.1105499-1-robh@kernel.org Signed-off-by: Will Deacon --- arch/arm/include/asm/arm_pmuv3.h | 6 ++++++ arch/arm64/include/asm/arm_pmuv3.h | 10 ++++++++++ arch/arm64/tools/sysreg | 8 ++++++++ drivers/perf/arm_pmuv3.c | 29 +++++++++++++++++++---------- include/linux/perf/arm_pmuv3.h | 1 + 5 files changed, 44 insertions(+), 10 deletions(-) diff --git a/arch/arm/include/asm/arm_pmuv3.h b/arch/arm/include/asm/arm_pmuv3.h index f63ba8986b24..d242b5e1ca0d 100644 --- a/arch/arm/include/asm/arm_pmuv3.h +++ b/arch/arm/include/asm/arm_pmuv3.h @@ -231,6 +231,7 @@ static inline void kvm_vcpu_pmu_resync_el0(void) {} #define ARMV8_PMU_DFR_VER_V3P1 0x4 #define ARMV8_PMU_DFR_VER_V3P4 0x5 #define ARMV8_PMU_DFR_VER_V3P5 0x6 +#define ARMV8_PMU_DFR_VER_V3P9 0x9 #define ARMV8_PMU_DFR_VER_IMP_DEF 0xF static inline bool pmuv3_implemented(int pmuver) @@ -249,6 +250,11 @@ static inline bool is_pmuv3p5(int pmuver) return pmuver >= ARMV8_PMU_DFR_VER_V3P5; } +static inline bool is_pmuv3p9(int pmuver) +{ + return pmuver >= ARMV8_PMU_DFR_VER_V3P9; +} + static inline u64 read_pmceid0(void) { u64 val = read_sysreg(PMCEID0); diff --git a/arch/arm64/include/asm/arm_pmuv3.h b/arch/arm64/include/asm/arm_pmuv3.h index 468a049bc63b..8a777dec8d88 100644 --- a/arch/arm64/include/asm/arm_pmuv3.h +++ b/arch/arm64/include/asm/arm_pmuv3.h @@ -152,6 +152,11 @@ static inline void write_pmuserenr(u32 val) write_sysreg(val, pmuserenr_el0); } +static inline void write_pmuacr(u64 val) +{ + write_sysreg_s(val, SYS_PMUACR_EL1); +} + static inline u64 read_pmceid0(void) { return read_sysreg(pmceid0_el0); @@ -178,4 +183,9 @@ static inline bool is_pmuv3p5(int pmuver) return pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P5; } +static inline bool is_pmuv3p9(int pmuver) +{ + return pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P9; +} + #endif diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 8d637ac4b7c6..74fb5af91d4f 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1238,6 +1238,7 @@ UnsignedEnum 11:8 PMUVer 0b0110 V3P5 0b0111 V3P7 0b1000 V3P8 + 0b1001 V3P9 0b1111 IMP_DEF EndEnum UnsignedEnum 7:4 TraceVer @@ -2178,6 +2179,13 @@ Field 4 P Field 3:0 ALIGN EndSysreg +Sysreg PMUACR_EL1 3 0 9 14 4 +Res0 63:33 +Field 32 F0 +Field 31 C +Field 30:0 P +EndSysreg + Sysreg PMSELR_EL0 3 3 9 12 5 Res0 63:5 Field 4:0 SEL diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 0afe02f879b4..bb93d32b86ea 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -770,18 +770,27 @@ static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu) int i; struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events); - /* Clear any unused counters to avoid leaking their contents */ - for_each_andnot_bit(i, cpu_pmu->cntr_mask, cpuc->used_mask, - ARMPMU_MAX_HWEVENTS) { - if (i == ARMV8_PMU_CYCLE_IDX) - write_pmccntr(0); - else if (i == ARMV8_PMU_INSTR_IDX) - write_pmicntr(0); - else - armv8pmu_write_evcntr(i, 0); + if (is_pmuv3p9(cpu_pmu->pmuver)) { + u64 mask = 0; + for_each_set_bit(i, cpuc->used_mask, ARMPMU_MAX_HWEVENTS) { + if (armv8pmu_event_has_user_read(cpuc->events[i])) + mask |= BIT(i); + } + write_pmuacr(mask); + } else { + /* Clear any unused counters to avoid leaking their contents */ + for_each_andnot_bit(i, cpu_pmu->cntr_mask, cpuc->used_mask, + ARMPMU_MAX_HWEVENTS) { + if (i == ARMV8_PMU_CYCLE_IDX) + write_pmccntr(0); + else if (i == ARMV8_PMU_INSTR_IDX) + write_pmicntr(0); + else + armv8pmu_write_evcntr(i, 0); + } } - 
update_pmuserenr(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR); + update_pmuserenr(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_UEN); } static void armv8pmu_enable_event(struct perf_event *event) diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index 3372c1b56486..d698efba28a2 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -257,6 +257,7 @@ #define ARMV8_PMU_USERENR_SW (1 << 1) /* PMSWINC can be written at EL0 */ #define ARMV8_PMU_USERENR_CR (1 << 2) /* Cycle counter can be read at EL0 */ #define ARMV8_PMU_USERENR_ER (1 << 3) /* Event counter can be read at EL0 */ +#define ARMV8_PMU_USERENR_UEN (1 << 4) /* Fine grained per counter access at EL0 */ /* Mask for writable bits */ #define ARMV8_PMU_USERENR_MASK (ARMV8_PMU_USERENR_EN | ARMV8_PMU_USERENR_SW | \ ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_ER) From e1dce56443a4a18978fe39ee4af663e5b6b31422 Mon Sep 17 00:00:00 2001 From: Gowthami Thiagarajan Date: Mon, 28 Oct 2024 11:23:09 +0530 Subject: [PATCH 109/168] perf/marvell: Marvell PEM performance monitor support PCI Express Interface PMU includes various performance counters to monitor the data that is transmitted over the PCIe link. The counters track various inbound and outbound transactions which includes separate counters for posted/non-posted/completion TLPs. Also, inbound and outbound memory read requests along with their latencies can also be monitored. Address Translation Services(ATS)events such as ATS Translation, ATS Page Request, ATS Invalidation along with their corresponding latencies are also supported. The performance counters are 64 bits wide. For instance, perf stat -e ib_tlp_pr tracks the inbound posted TLPs for the workload. Co-developed-by: Linu Cherian Signed-off-by: Linu Cherian Signed-off-by: Gowthami Thiagarajan Link: https://lore.kernel.org/r/20241028055309.17893-1-gthiagarajan@marvell.com Signed-off-by: Will Deacon --- Documentation/admin-guide/perf/index.rst | 1 + .../admin-guide/perf/mrvl-pem-pmu.rst | 56 +++ MAINTAINERS | 6 + drivers/perf/Kconfig | 7 + drivers/perf/Makefile | 1 + drivers/perf/marvell_pem_pmu.c | 425 ++++++++++++++++++ include/linux/cpuhotplug.h | 1 + 7 files changed, 497 insertions(+) create mode 100644 Documentation/admin-guide/perf/mrvl-pem-pmu.rst create mode 100644 drivers/perf/marvell_pem_pmu.c diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst index 8502bc174640..a58bd3f7e190 100644 --- a/Documentation/admin-guide/perf/index.rst +++ b/Documentation/admin-guide/perf/index.rst @@ -26,3 +26,4 @@ Performance monitor support meson-ddr-pmu cxl ampere_cspmu + mrvl-pem-pmu diff --git a/Documentation/admin-guide/perf/mrvl-pem-pmu.rst b/Documentation/admin-guide/perf/mrvl-pem-pmu.rst new file mode 100644 index 000000000000..c39007149b97 --- /dev/null +++ b/Documentation/admin-guide/perf/mrvl-pem-pmu.rst @@ -0,0 +1,56 @@ +================================================================= +Marvell Odyssey PEM Performance Monitoring Unit (PMU UNCORE) +================================================================= + +The PCI Express Interface Units(PEM) are associated with a corresponding +monitoring unit. This includes performance counters to track various +characteristics of the data that is transmitted over the PCIe link. + +The counters track inbound and outbound transactions which +includes separate counters for posted/non-posted/completion TLPs. 
+Also, inbound and outbound memory read requests along with their +latencies can also be monitored. Address Translation Services(ATS)events +such as ATS Translation, ATS Page Request, ATS Invalidation along with +their corresponding latencies are also tracked. + +There are separate 64 bit counters to measure posted/non-posted/completion +tlps in inbound and outbound transactions. ATS events are measured by +different counters. + +The PMU driver exposes the available events and format options under sysfs, +/sys/bus/event_source/devices/mrvl_pcie_rc_pmu_<>/events/ +/sys/bus/event_source/devices/mrvl_pcie_rc_pmu_<>/format/ + +Examples:: + + # perf list | grep mrvl_pcie_rc_pmu + mrvl_pcie_rc_pmu_<>/ats_inv/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ats_inv_latency/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ats_pri/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ats_pri_latency/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ats_trans/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ats_trans_latency/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ib_inflight/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ib_reads/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ib_req_no_ro_ebus/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ib_req_no_ro_ncb/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ib_tlp_cpl_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ib_tlp_dwords_cpl_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ib_tlp_dwords_npr/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ib_tlp_dwords_pr/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ib_tlp_npr/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ib_tlp_pr/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ob_inflight_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ob_merges_cpl_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ob_merges_npr_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ob_merges_pr_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ob_reads_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ob_tlp_cpl_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ob_tlp_dwords_cpl_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ob_tlp_dwords_npr_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ob_tlp_dwords_pr_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ob_tlp_npr_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ob_tlp_pr_partid/ [Kernel PMU event] + + + # perf stat -e ib_inflight,ib_reads,ib_req_no_ro_ebus,ib_req_no_ro_ncb diff --git a/MAINTAINERS b/MAINTAINERS index 7ad507f49324..aaf2cfbf1d7c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13845,6 +13845,12 @@ S: Supported F: Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst F: drivers/net/ethernet/marvell/octeontx2/af/ +MARVELL PEM PMU DRIVER +M: Linu Cherian +M: Gowthami Thiagarajan +S: Supported +F: drivers/perf/marvell_pem_pmu.c + MARVELL PRESTERA ETHERNET SWITCH DRIVER M: Taras Chornyi S: Supported diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index bab8ba64162f..4e268de351c4 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -284,4 +284,11 @@ config CXL_PMU If unsure say 'm'. +config MARVELL_PEM_PMU + tristate "MARVELL PEM PMU Support" + depends on ARCH_THUNDER || (COMPILE_TEST && 64BIT) + help + Enable support for PCIe Interface performance monitoring + on Marvell platform. 
+ endmenu diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index 8268f38e42c5..de71d2574857 100644 --- a/drivers/perf/Makefile +++ b/drivers/perf/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_ARM_SPE_PMU) += arm_spe_pmu.o obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o obj-$(CONFIG_MARVELL_CN10K_DDR_PMU) += marvell_cn10k_ddr_pmu.o +obj-$(CONFIG_MARVELL_PEM_PMU) += marvell_pem_pmu.o obj-$(CONFIG_APPLE_M1_CPU_PMU) += apple_m1_cpu_pmu.o obj-$(CONFIG_ALIBABA_UNCORE_DRW_PMU) += alibaba_uncore_drw_pmu.o obj-$(CONFIG_DWC_PCIE_PMU) += dwc_pcie_pmu.o diff --git a/drivers/perf/marvell_pem_pmu.c b/drivers/perf/marvell_pem_pmu.c new file mode 100644 index 000000000000..29fbcd1848e4 --- /dev/null +++ b/drivers/perf/marvell_pem_pmu.c @@ -0,0 +1,425 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Marvell PEM(PCIe RC) Performance Monitor Driver + * + * Copyright (C) 2024 Marvell. + */ + +#include +#include +#include +#include +#include +#include + +/* + * Each of these events maps to a free running 64 bit counter + * with no event control, but can be reset. + */ +enum pem_events { + IB_TLP_NPR, + IB_TLP_PR, + IB_TLP_CPL, + IB_TLP_DWORDS_NPR, + IB_TLP_DWORDS_PR, + IB_TLP_DWORDS_CPL, + IB_INFLIGHT, + IB_READS, + IB_REQ_NO_RO_NCB, + IB_REQ_NO_RO_EBUS, + OB_TLP_NPR, + OB_TLP_PR, + OB_TLP_CPL, + OB_TLP_DWORDS_NPR, + OB_TLP_DWORDS_PR, + OB_TLP_DWORDS_CPL, + OB_INFLIGHT, + OB_READS, + OB_MERGES_NPR, + OB_MERGES_PR, + OB_MERGES_CPL, + ATS_TRANS, + ATS_TRANS_LATENCY, + ATS_PRI, + ATS_PRI_LATENCY, + ATS_INV, + ATS_INV_LATENCY, + PEM_EVENTIDS_MAX +}; + +static u64 eventid_to_offset_table[] = { + [IB_TLP_NPR] = 0x0, + [IB_TLP_PR] = 0x8, + [IB_TLP_CPL] = 0x10, + [IB_TLP_DWORDS_NPR] = 0x100, + [IB_TLP_DWORDS_PR] = 0x108, + [IB_TLP_DWORDS_CPL] = 0x110, + [IB_INFLIGHT] = 0x200, + [IB_READS] = 0x300, + [IB_REQ_NO_RO_NCB] = 0x400, + [IB_REQ_NO_RO_EBUS] = 0x408, + [OB_TLP_NPR] = 0x500, + [OB_TLP_PR] = 0x508, + [OB_TLP_CPL] = 0x510, + [OB_TLP_DWORDS_NPR] = 0x600, + [OB_TLP_DWORDS_PR] = 0x608, + [OB_TLP_DWORDS_CPL] = 0x610, + [OB_INFLIGHT] = 0x700, + [OB_READS] = 0x800, + [OB_MERGES_NPR] = 0x900, + [OB_MERGES_PR] = 0x908, + [OB_MERGES_CPL] = 0x910, + [ATS_TRANS] = 0x2D18, + [ATS_TRANS_LATENCY] = 0x2D20, + [ATS_PRI] = 0x2D28, + [ATS_PRI_LATENCY] = 0x2D30, + [ATS_INV] = 0x2D38, + [ATS_INV_LATENCY] = 0x2D40, +}; + +struct pem_pmu { + struct pmu pmu; + void __iomem *base; + unsigned int cpu; + struct device *dev; + struct hlist_node node; +}; + +#define to_pem_pmu(p) container_of(p, struct pem_pmu, pmu) + +static int eventid_to_offset(int eventid) +{ + return eventid_to_offset_table[eventid]; +} + +/* Events */ +static ssize_t pem_pmu_event_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr; + + pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); + return sysfs_emit(page, "event=0x%02llx\n", pmu_attr->id); +} + +#define PEM_EVENT_ATTR(_name, _id) \ + (&((struct perf_pmu_events_attr[]) { \ + { .attr = __ATTR(_name, 0444, pem_pmu_event_show, NULL), \ + .id = _id, } \ + })[0].attr.attr) + +static struct attribute *pem_perf_events_attrs[] = { + PEM_EVENT_ATTR(ib_tlp_npr, IB_TLP_NPR), + PEM_EVENT_ATTR(ib_tlp_pr, IB_TLP_PR), + PEM_EVENT_ATTR(ib_tlp_cpl_partid, IB_TLP_CPL), + PEM_EVENT_ATTR(ib_tlp_dwords_npr, IB_TLP_DWORDS_NPR), + PEM_EVENT_ATTR(ib_tlp_dwords_pr, IB_TLP_DWORDS_PR), + PEM_EVENT_ATTR(ib_tlp_dwords_cpl_partid, IB_TLP_DWORDS_CPL), + PEM_EVENT_ATTR(ib_inflight, IB_INFLIGHT), + 
PEM_EVENT_ATTR(ib_reads, IB_READS), + PEM_EVENT_ATTR(ib_req_no_ro_ncb, IB_REQ_NO_RO_NCB), + PEM_EVENT_ATTR(ib_req_no_ro_ebus, IB_REQ_NO_RO_EBUS), + PEM_EVENT_ATTR(ob_tlp_npr_partid, OB_TLP_NPR), + PEM_EVENT_ATTR(ob_tlp_pr_partid, OB_TLP_PR), + PEM_EVENT_ATTR(ob_tlp_cpl_partid, OB_TLP_CPL), + PEM_EVENT_ATTR(ob_tlp_dwords_npr_partid, OB_TLP_DWORDS_NPR), + PEM_EVENT_ATTR(ob_tlp_dwords_pr_partid, OB_TLP_DWORDS_PR), + PEM_EVENT_ATTR(ob_tlp_dwords_cpl_partid, OB_TLP_DWORDS_CPL), + PEM_EVENT_ATTR(ob_inflight_partid, OB_INFLIGHT), + PEM_EVENT_ATTR(ob_reads_partid, OB_READS), + PEM_EVENT_ATTR(ob_merges_npr_partid, OB_MERGES_NPR), + PEM_EVENT_ATTR(ob_merges_pr_partid, OB_MERGES_PR), + PEM_EVENT_ATTR(ob_merges_cpl_partid, OB_MERGES_CPL), + PEM_EVENT_ATTR(ats_trans, ATS_TRANS), + PEM_EVENT_ATTR(ats_trans_latency, ATS_TRANS_LATENCY), + PEM_EVENT_ATTR(ats_pri, ATS_PRI), + PEM_EVENT_ATTR(ats_pri_latency, ATS_PRI_LATENCY), + PEM_EVENT_ATTR(ats_inv, ATS_INV), + PEM_EVENT_ATTR(ats_inv_latency, ATS_INV_LATENCY), + NULL +}; + +static struct attribute_group pem_perf_events_attr_group = { + .name = "events", + .attrs = pem_perf_events_attrs, +}; + +PMU_FORMAT_ATTR(event, "config:0-5"); + +static struct attribute *pem_perf_format_attrs[] = { + &format_attr_event.attr, + NULL +}; + +static struct attribute_group pem_perf_format_attr_group = { + .name = "format", + .attrs = pem_perf_format_attrs, +}; + +/* cpumask */ +static ssize_t pem_perf_cpumask_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pem_pmu *pmu = dev_get_drvdata(dev); + + return cpumap_print_to_pagebuf(true, buf, cpumask_of(pmu->cpu)); +} + +static struct device_attribute pem_perf_cpumask_attr = + __ATTR(cpumask, 0444, pem_perf_cpumask_show, NULL); + +static struct attribute *pem_perf_cpumask_attrs[] = { + &pem_perf_cpumask_attr.attr, + NULL +}; + +static struct attribute_group pem_perf_cpumask_attr_group = { + .attrs = pem_perf_cpumask_attrs, +}; + +static const struct attribute_group *pem_perf_attr_groups[] = { + &pem_perf_events_attr_group, + &pem_perf_cpumask_attr_group, + &pem_perf_format_attr_group, + NULL +}; + +static int pem_perf_event_init(struct perf_event *event) +{ + struct pem_pmu *pmu = to_pem_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + struct perf_event *sibling; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + if (event->attr.config >= PEM_EVENTIDS_MAX) + return -EINVAL; + + if (is_sampling_event(event) || + event->attach_state & PERF_ATTACH_TASK) { + return -EOPNOTSUPP; + } + + if (event->cpu < 0) + return -EOPNOTSUPP; + + /* We must NOT create groups containing mixed PMUs */ + if (event->group_leader->pmu != event->pmu && + !is_software_event(event->group_leader)) + return -EINVAL; + + for_each_sibling_event(sibling, event->group_leader) { + if (sibling->pmu != event->pmu && + !is_software_event(sibling)) + return -EINVAL; + } + /* + * Set ownership of event to one CPU, same event can not be observed + * on multiple cpus at same time. 
+ */ + event->cpu = pmu->cpu; + hwc->idx = -1; + return 0; +} + +static u64 pem_perf_read_counter(struct pem_pmu *pmu, + struct perf_event *event, int eventid) +{ + return readq_relaxed(pmu->base + eventid_to_offset(eventid)); +} + +static void pem_perf_event_update(struct perf_event *event) +{ + struct pem_pmu *pmu = to_pem_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + u64 prev_count, new_count; + + do { + prev_count = local64_read(&hwc->prev_count); + new_count = pem_perf_read_counter(pmu, event, hwc->idx); + } while (local64_xchg(&hwc->prev_count, new_count) != prev_count); + + local64_add((new_count - prev_count), &event->count); +} + +static void pem_perf_event_start(struct perf_event *event, int flags) +{ + struct pem_pmu *pmu = to_pem_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int eventid = hwc->idx; + + /* + * All counters are free-running and associated with + * a fixed event to track in Hardware + */ + local64_set(&hwc->prev_count, + pem_perf_read_counter(pmu, event, eventid)); + + hwc->state = 0; +} + +static int pem_perf_event_add(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = event->attr.config; + if (WARN_ON_ONCE(hwc->idx >= PEM_EVENTIDS_MAX)) + return -EINVAL; + hwc->state |= PERF_HES_STOPPED; + + if (flags & PERF_EF_START) + pem_perf_event_start(event, flags); + + return 0; +} + +static void pem_perf_event_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + if (flags & PERF_EF_UPDATE) + pem_perf_event_update(event); + + hwc->state |= PERF_HES_STOPPED; +} + +static void pem_perf_event_del(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + pem_perf_event_stop(event, PERF_EF_UPDATE); + hwc->idx = -1; +} + +static int pem_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) +{ + struct pem_pmu *pmu = hlist_entry_safe(node, struct pem_pmu, node); + unsigned int target; + + if (cpu != pmu->cpu) + return 0; + + target = cpumask_any_but(cpu_online_mask, cpu); + if (target >= nr_cpu_ids) + return 0; + + perf_pmu_migrate_context(&pmu->pmu, cpu, target); + pmu->cpu = target; + return 0; +} + +static int pem_perf_probe(struct platform_device *pdev) +{ + struct pem_pmu *pem_pmu; + struct resource *res; + void __iomem *base; + char *name; + int ret; + + pem_pmu = devm_kzalloc(&pdev->dev, sizeof(*pem_pmu), GFP_KERNEL); + if (!pem_pmu) + return -ENOMEM; + + pem_pmu->dev = &pdev->dev; + platform_set_drvdata(pdev, pem_pmu); + + base = devm_platform_get_and_ioremap_resource(pdev, 0, &res); + if (IS_ERR(base)) + return PTR_ERR(base); + + pem_pmu->base = base; + + pem_pmu->pmu = (struct pmu) { + .module = THIS_MODULE, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .task_ctx_nr = perf_invalid_context, + .attr_groups = pem_perf_attr_groups, + .event_init = pem_perf_event_init, + .add = pem_perf_event_add, + .del = pem_perf_event_del, + .start = pem_perf_event_start, + .stop = pem_perf_event_stop, + .read = pem_perf_event_update, + }; + + /* Choose this cpu to collect perf data */ + pem_pmu->cpu = raw_smp_processor_id(); + + name = devm_kasprintf(pem_pmu->dev, GFP_KERNEL, "mrvl_pcie_rc_pmu_%llx", + res->start); + if (!name) + return -ENOMEM; + + cpuhp_state_add_instance_nocalls(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE, + &pem_pmu->node); + + ret = perf_pmu_register(&pem_pmu->pmu, name, -1); + if (ret) + goto error; + + return 0; +error: + cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE, + &pem_pmu->node); + return ret; +} + 
+static void pem_perf_remove(struct platform_device *pdev) +{ + struct pem_pmu *pem_pmu = platform_get_drvdata(pdev); + + cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE, + &pem_pmu->node); + + perf_pmu_unregister(&pem_pmu->pmu); +} + +#ifdef CONFIG_ACPI +static const struct acpi_device_id pem_pmu_acpi_match[] = { + {"MRVL000E", 0}, + {} +}; +MODULE_DEVICE_TABLE(acpi, pem_pmu_acpi_match); +#endif + +static struct platform_driver pem_pmu_driver = { + .driver = { + .name = "pem-pmu", + .acpi_match_table = ACPI_PTR(pem_pmu_acpi_match), + .suppress_bind_attrs = true, + }, + .probe = pem_perf_probe, + .remove = pem_perf_remove, +}; + +static int __init pem_pmu_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE, + "perf/marvell/pem:online", NULL, + pem_pmu_offline_cpu); + if (ret) + return ret; + + ret = platform_driver_register(&pem_pmu_driver); + if (ret) + cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE); + return ret; +} + +static void __exit pem_pmu_exit(void) +{ + platform_driver_unregister(&pem_pmu_driver); + cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE); +} + +module_init(pem_pmu_init); +module_exit(pem_pmu_exit); + +MODULE_DESCRIPTION("Marvell PEM Perf driver"); +MODULE_AUTHOR("Gowthami Thiagarajan "); +MODULE_LICENSE("GPL"); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 2361ed4d2b15..61d9a66d1807 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -227,6 +227,7 @@ enum cpuhp_state { CPUHP_AP_PERF_ARM_APM_XGENE_ONLINE, CPUHP_AP_PERF_ARM_CAVIUM_TX2_UNCORE_ONLINE, CPUHP_AP_PERF_ARM_MARVELL_CN10K_DDR_ONLINE, + CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE, CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE, From bdc9a64c8b2041edfbcf02d07ee7402df65918e9 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Tue, 29 Oct 2024 07:06:02 -0500 Subject: [PATCH 110/168] ARM: pmuv3: Add missing write_pmuacr() Fix compilation on Arm by adding missing static inline write_pmuacr() declaration. Fixes: 0bbff9ed8165 ("perf/arm_pmuv3: Add PMUv3.9 per counter EL0 access control") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202410291954.NiHLIwSC-lkp@intel.com/ Signed-off-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20241029120602.4061566-2-robh@kernel.org Signed-off-by: Will Deacon --- arch/arm/include/asm/arm_pmuv3.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/include/asm/arm_pmuv3.h b/arch/arm/include/asm/arm_pmuv3.h index d242b5e1ca0d..2ec0e5e83fc9 100644 --- a/arch/arm/include/asm/arm_pmuv3.h +++ b/arch/arm/include/asm/arm_pmuv3.h @@ -212,6 +212,8 @@ static inline void write_pmuserenr(u32 val) write_sysreg(val, PMUSERENR); } +static inline void write_pmuacr(u64 val) {} + static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {} static inline void kvm_clr_pmu_events(u32 clr) {} static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr) From 83d511c3ca0cb70a55d8b0ae3e753448fb00272b Mon Sep 17 00:00:00 2001 From: Ilkka Koskinen Date: Tue, 8 Oct 2024 23:18:22 +0000 Subject: [PATCH 111/168] perf/dwc_pcie: Add support for Ampere SoCs Add support for Ampere SoCs by adding Ampere's vendor ID to the vendor list. 
Signed-off-by: Ilkka Koskinen Link: https://lore.kernel.org/r/20241008231824.5102-2-ilkka@os.amperecomputing.com Signed-off-by: Will Deacon --- drivers/perf/dwc_pcie_pmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c index 126d2c3516ad..edb5a809c928 100644 --- a/drivers/perf/dwc_pcie_pmu.c +++ b/drivers/perf/dwc_pcie_pmu.c @@ -106,6 +106,7 @@ struct dwc_pcie_vendor_id { static const struct dwc_pcie_vendor_id dwc_pcie_vendor_ids[] = { {.vendor_id = PCI_VENDOR_ID_ALIBABA }, + {.vendor_id = PCI_VENDOR_ID_AMPERE }, {.vendor_id = PCI_VENDOR_ID_QCOM }, {} /* terminator */ }; From 94b3ad10c2e1d0e761756a844b78a21101dd1810 Mon Sep 17 00:00:00 2001 From: Ilkka Koskinen Date: Tue, 8 Oct 2024 23:18:24 +0000 Subject: [PATCH 112/168] perf/dwc_pcie: Fix typos in event names Fix a few typos in event names Signed-off-by: Ilkka Koskinen Reviewed-by: Jing Zhang Reviewed-by: Shuai Xue Link: https://lore.kernel.org/r/20241008231824.5102-4-ilkka@os.amperecomputing.com Signed-off-by: Will Deacon --- drivers/perf/dwc_pcie_pmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c index edb5a809c928..9cbea9675e21 100644 --- a/drivers/perf/dwc_pcie_pmu.c +++ b/drivers/perf/dwc_pcie_pmu.c @@ -216,9 +216,9 @@ static struct attribute *dwc_pcie_pmu_time_event_attrs[] = { DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_update_fc_dllp, 0x601), DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_ack_dllp, 0x602), DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_update_fc_dllp, 0x603), - DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_nulified_tlp, 0x604), - DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_nulified_tlp, 0x605), - DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_duplicate_tl, 0x606), + DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_nullified_tlp, 0x604), + DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_nullified_tlp, 0x605), + DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_duplicate_tlp, 0x606), DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_memory_write, 0x700), DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_memory_read, 0x701), DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_configuration_write, 0x702), From 3930c88ad0a5a8538ef8a281d4f4c230fb4d4fc4 Mon Sep 17 00:00:00 2001 From: Markuss Broks Date: Sat, 26 Oct 2024 23:28:08 +0300 Subject: [PATCH 113/168] dt-bindings: arm: pmu: Add Samsung Mongoose core compatible Add the compatible for the Samsung Mongoose CPU PMU to the schema. Co-developed-by: Maksym Holovach Signed-off-by: Maksym Holovach Signed-off-by: Markuss Broks Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20241026-mongoose-pmu-v1-1-f1a7448054be@gmail.com Signed-off-by: Will Deacon --- Documentation/devicetree/bindings/arm/pmu.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/arm/pmu.yaml b/Documentation/devicetree/bindings/arm/pmu.yaml index 528544d0a161..a148ff54f2b8 100644 --- a/Documentation/devicetree/bindings/arm/pmu.yaml +++ b/Documentation/devicetree/bindings/arm/pmu.yaml @@ -74,6 +74,7 @@ properties: - qcom,krait-pmu - qcom,scorpion-pmu - qcom,scorpion-mp-pmu + - samsung,mongoose-pmu interrupts: # Don't know how many CPUs, so no constraints to specify From 9643aaa194737b1ad73f9da6e51fc63d1c7a864a Mon Sep 17 00:00:00 2001 From: Markuss Broks Date: Sat, 26 Oct 2024 23:28:09 +0300 Subject: [PATCH 114/168] perf: arm_pmuv3: Add support for Samsung Mongoose PMU Add support for the Samsung Mongoose CPU core PMU. This just adds the names and links to DT compatible strings. 
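With the PMUV3_INIT_SIMPLE() entry below, the core should register
under the usual PMUv3 naming scheme, so (sysfs path assumed) something
like:

	# ls /sys/bus/event_source/devices/ | grep mongoose
	armv8_samsung_mongoose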
Co-developed-by: Maksym Holovach Signed-off-by: Maksym Holovach Signed-off-by: Markuss Broks Link: https://lore.kernel.org/r/20241026-mongoose-pmu-v1-2-f1a7448054be@gmail.com Signed-off-by: Will Deacon --- drivers/perf/arm_pmuv3.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index bb93d32b86ea..b5cc11abc962 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -1373,6 +1373,8 @@ PMUV3_INIT_SIMPLE(armv8_neoverse_v3ae) PMUV3_INIT_SIMPLE(armv8_nvidia_carmel) PMUV3_INIT_SIMPLE(armv8_nvidia_denver) +PMUV3_INIT_SIMPLE(armv8_samsung_mongoose) + PMUV3_INIT_MAP_EVENT(armv8_cortex_a35, armv8_a53_map_event) PMUV3_INIT_MAP_EVENT(armv8_cortex_a53, armv8_a53_map_event) PMUV3_INIT_MAP_EVENT(armv8_cortex_a57, armv8_a57_map_event) @@ -1418,6 +1420,7 @@ static const struct of_device_id armv8_pmu_of_device_ids[] = { {.compatible = "brcm,vulcan-pmu", .data = armv8_brcm_vulcan_pmu_init}, {.compatible = "nvidia,carmel-pmu", .data = armv8_nvidia_carmel_pmu_init}, {.compatible = "nvidia,denver-pmu", .data = armv8_nvidia_denver_pmu_init}, + {.compatible = "samsung,mongoose-pmu", .data = armv8_samsung_mongoose_pmu_init}, {}, }; From 2cfdb799dc7681a93844e5019f9bbff603c2c9ee Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Mon, 28 Oct 2024 18:57:21 +0000 Subject: [PATCH 115/168] arm64: mops: Document requirements for hypervisors Add a mops.rst document to clarify in more detail what hypervisors need to do to run a Linux guest on a system with FEAT_MOPS. Signed-off-by: Kristina Martsenko Link: https://lore.kernel.org/r/20241028185721.52852-1-kristina.martsenko@arm.com Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/booting.rst | 4 +-- Documentation/arch/arm64/index.rst | 1 + Documentation/arch/arm64/mops.rst | 44 ++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 2 deletions(-) create mode 100644 Documentation/arch/arm64/mops.rst diff --git a/Documentation/arch/arm64/booting.rst b/Documentation/arch/arm64/booting.rst index db46af5b9f0f..dabd279dee5d 100644 --- a/Documentation/arch/arm64/booting.rst +++ b/Documentation/arch/arm64/booting.rst @@ -385,8 +385,8 @@ Before jumping into the kernel, the following conditions must be met: - HCRX_EL2.MSCEn (bit 11) must be initialised to 0b1. - - HCRX_EL2.MCE2 (bit 10) must be initialised to 0b1. The exception - handler must set PSTATE.SS to 0b0. + - HCRX_EL2.MCE2 (bit 10) must be initialised to 0b1 and the hypervisor + must handle MOPS exceptions as described in :ref:`arm64_mops_hyp`. For CPUs with the Extended Translation Control Register feature (FEAT_TCR2): diff --git a/Documentation/arch/arm64/index.rst b/Documentation/arch/arm64/index.rst index 78544de0a8a9..463de5855e84 100644 --- a/Documentation/arch/arm64/index.rst +++ b/Documentation/arch/arm64/index.rst @@ -20,6 +20,7 @@ ARM64 Architecture legacy_instructions memory memory-tagging-extension + mops perf pointer-authentication ptdump diff --git a/Documentation/arch/arm64/mops.rst b/Documentation/arch/arm64/mops.rst new file mode 100644 index 000000000000..2ef5b147f8dc --- /dev/null +++ b/Documentation/arch/arm64/mops.rst @@ -0,0 +1,44 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================================== +Memory copy/set instructions (MOPS) +=================================== + +A MOPS memory copy/set operation consists of three consecutive CPY* or SET* +instructions: a prologue, main and epilogue (for example: CPYP, CPYM, CPYE). 
+ +A main or epilogue instruction can take a MOPS exception for various reasons, +for example when a task is migrated to a CPU with a different MOPS +implementation, or when the instruction's alignment and size requirements are +not met. The software exception handler is then expected to reset the registers +and restart execution from the prologue instruction. Normally this is handled +by the kernel. + +For more details refer to "D1.3.5.7 Memory Copy and Memory Set exceptions" in +the Arm Architecture Reference Manual DDI 0487K.a (Arm ARM). + +.. _arm64_mops_hyp: + +Hypervisor requirements +----------------------- + +A hypervisor running a Linux guest must handle all MOPS exceptions from the +guest kernel, as Linux may not be able to handle the exception at all times. +For example, a MOPS exception can be taken when the hypervisor migrates a vCPU +to another physical CPU with a different MOPS implementation. + +To do this, the hypervisor must: + + - Set HCRX_EL2.MCE2 to 1 so that the exception is taken to the hypervisor. + + - Have an exception handler that implements the algorithm from the Arm ARM + rules CNTMJ and MWFQH. + + - Set the guest's PSTATE.SS to 0 in the exception handler, to handle a + potential step of the current instruction. + + Note: Clearing PSTATE.SS is needed so that a single step exception is taken + on the next instruction (the prologue instruction). Otherwise prologue + would get silently stepped over and the single step exception taken on the + main instruction. Note that if the guest instruction is not being stepped + then clearing PSTATE.SS has no effect. From f8192813dcbe6d27e7031d9d353ea1fcc67052fd Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 29 Oct 2024 10:15:29 +0530 Subject: [PATCH 116/168] arm64/mm: Re-organize arch_make_huge_pte() Core HugeTLB defines a fallback definition for arch_make_huge_pte(), which calls platform provided pte_mkhuge(). But if any platform already provides an override for arch_make_huge_pte(), then it does not need to provide the helper pte_mkhuge(). arm64 override for arch_make_huge_pte() calls pte_mkhuge() internally, thus creating an impression, that both of these callbacks are being used in core HugeTLB and hence required to be defined. This drops off pte_mkhuge() which was never required to begin with as there could not be any section mappings at the PTE level. Re-organize arch_make_huge_pte() based on requested page size and create the entry for the applicable page table level as needed. It also removes a redundancy of clearing PTE_TABLE_BIT bit followed by setting both PTE_TABLE_BIT and PTE_VALID bits (via PTE_TYPE_MASK) in the pte, while creating CONT_PTE_SIZE size entries. Cc: Will Deacon Cc: Ard Biesheuvel Cc: Ryan Roberts Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20241029044529.2624785-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 5 ----- arch/arm64/mm/hugetlbpage.c | 21 ++++++++++++++++----- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d56dbe5742d6..609fd4447b19 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -438,11 +438,6 @@ static inline void __set_ptes(struct mm_struct *mm, } } -/* - * Huge pte definitions. - */ -#define pte_mkhuge(pte) (__pte(pte_val(pte) & ~PTE_TABLE_BIT)) - /* * Hugetlb definitions. 
*/ diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 5f1e2103888b..3215adf48a1b 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -361,14 +361,25 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { size_t pagesize = 1UL << shift; - entry = pte_mkhuge(entry); - if (pagesize == CONT_PTE_SIZE) { - entry = pte_mkcont(entry); - } else if (pagesize == CONT_PMD_SIZE) { + switch (pagesize) { +#ifndef __PAGETABLE_PMD_FOLDED + case PUD_SIZE: + entry = pud_pte(pud_mkhuge(pte_pud(entry))); + break; +#endif + case CONT_PMD_SIZE: entry = pmd_pte(pmd_mkcont(pte_pmd(entry))); - } else if (pagesize != PUD_SIZE && pagesize != PMD_SIZE) { + fallthrough; + case PMD_SIZE: + entry = pmd_pte(pmd_mkhuge(pte_pmd(entry))); + break; + case CONT_PTE_SIZE: + entry = pte_mkcont(entry); + break; + default: pr_warn("%s: unrecognized huge page size 0x%lx\n", __func__, pagesize); + break; } return entry; } From 9a0e3b92b02e7a921b22f4d00d87e3506d06b92a Mon Sep 17 00:00:00 2001 From: Liao Chang Date: Thu, 24 Oct 2024 03:41:20 +0000 Subject: [PATCH 117/168] arm64: Return early when break handler is found on linked-list The search for breakpoint handlers iterate through the entire linked list. Given that all registered hook has a valid fn field, and no registered hooks share the same mask and imm. This commit optimize the efficiency slightly by returning early as a matching handler is found. Signed-off-by: Liao Chang Acked-by: Will Deacon Link: https://lore.kernel.org/r/20241024034120.3814224-1-liaochang1@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/debug-monitors.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 024a7b245056..4713a4c65b1b 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -303,7 +303,6 @@ static int call_break_hook(struct pt_regs *regs, unsigned long esr) { struct break_hook *hook; struct list_head *list; - int (*fn)(struct pt_regs *regs, unsigned long esr) = NULL; list = user_mode(regs) ? &user_break_hook : &kernel_break_hook; @@ -313,10 +312,10 @@ static int call_break_hook(struct pt_regs *regs, unsigned long esr) */ list_for_each_entry_rcu(hook, list, node) { if ((esr_brk_comment(esr) & ~hook->mask) == hook->imm) - fn = hook->fn; + return hook->fn(regs, esr); } - return fn ? fn(regs, esr) : DBG_HOOK_ERROR; + return DBG_HOOK_ERROR; } NOKPROBE_SYMBOL(call_break_hook); From 17a2409783f141c9fcd03b140d17bbb75e98c630 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 29 Oct 2024 12:34:21 +0000 Subject: [PATCH 118/168] kselftest/arm64: Use ksft_perror() to log MTE failures The logging in the allocation helpers variously uses ksft_print_msg() with very intermittent logging of errno and perror() (which won't produce KTAP conformant output) when logging the result of API calls that set errno. Standardise on using the ksft_perror() helper in these cases so that more information is available should the tests fail. 
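For context, ksft_perror() is a small helper from
tools/testing/selftests/kselftest.h along these lines (sketched from
memory, not part of this change):

	static inline void ksft_perror(const char *msg)
	{
		ksft_print_msg("%s: %s (%d)\n", msg, strerror(errno), errno);
	}

so the message, strerror() text and errno all land on stdout with the
KTAP "# " comment prefix, unlike plain perror() which writes to stderr.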
Signed-off-by: Mark Brown Acked-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20241029-arm64-mte-test-logging-v1-1-a128e732e36e@kernel.org Signed-off-by: Catalin Marinas --- .../testing/selftests/arm64/mte/mte_common_util.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.c b/tools/testing/selftests/arm64/mte/mte_common_util.c index 17fbe5cfe472..a1dc2fe5285b 100644 --- a/tools/testing/selftests/arm64/mte/mte_common_util.c +++ b/tools/testing/selftests/arm64/mte/mte_common_util.c @@ -150,13 +150,13 @@ static void *__mte_allocate_memory_range(size_t size, int mem_type, int mapping, map_flag |= MAP_PRIVATE; ptr = mmap(NULL, entire_size, prot_flag, map_flag, fd, 0); if (ptr == MAP_FAILED) { - ksft_print_msg("FAIL: mmap allocation\n"); + ksft_perror("mmap()"); return NULL; } if (mem_type == USE_MPROTECT) { if (mprotect(ptr, entire_size, prot_flag | PROT_MTE)) { + ksft_perror("mprotect(PROT_MTE)"); munmap(ptr, size); - ksft_print_msg("FAIL: mprotect PROT_MTE property\n"); return NULL; } } @@ -190,13 +190,13 @@ void *mte_allocate_file_memory(size_t size, int mem_type, int mapping, bool tags lseek(fd, 0, SEEK_SET); for (index = INIT_BUFFER_SIZE; index < size; index += INIT_BUFFER_SIZE) { if (write(fd, buffer, INIT_BUFFER_SIZE) != INIT_BUFFER_SIZE) { - perror("initialising buffer"); + ksft_perror("initialising buffer"); return NULL; } } index -= INIT_BUFFER_SIZE; if (write(fd, buffer, size - index) != size - index) { - perror("initialising buffer"); + ksft_perror("initialising buffer"); return NULL; } return __mte_allocate_memory_range(size, mem_type, mapping, 0, 0, tags, fd); @@ -217,12 +217,12 @@ void *mte_allocate_file_memory_tag_range(size_t size, int mem_type, int mapping, lseek(fd, 0, SEEK_SET); for (index = INIT_BUFFER_SIZE; index < map_size; index += INIT_BUFFER_SIZE) if (write(fd, buffer, INIT_BUFFER_SIZE) != INIT_BUFFER_SIZE) { - perror("initialising buffer"); + ksft_perror("initialising buffer"); return NULL; } index -= INIT_BUFFER_SIZE; if (write(fd, buffer, map_size - index) != map_size - index) { - perror("initialising buffer"); + ksft_perror("initialising buffer"); return NULL; } return __mte_allocate_memory_range(size, mem_type, mapping, range_before, @@ -358,7 +358,7 @@ int create_temp_file(void) /* Create a file in the tmpfs filesystem */ fd = mkstemp(&filename[0]); if (fd == -1) { - perror(filename); + ksft_perror(filename); ksft_print_msg("FAIL: Unable to open temporary file\n"); return 0; } From 1caeda5ef2510e8af57b75edb74bd1daa6364c1e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 31 Oct 2024 19:21:38 +0000 Subject: [PATCH 119/168] arm64/gcs: Fix outdated ptrace documentation The ptrace documentation for GCS was written prior to the implementation of clone3() when we still blocked enabling of GCS via ptrace. This restriction was relaxed as part of implementing clone3() support since we implemented support for the GCS not being managed by the kernel but the documentation still mentions the restriction. Update the documentation to reflect what was merged. We have not yet merged clone3() itself but all the support other than in clone() itself is there. 
Fixes: 7058bf87cd59 ("arm64/gcs: Document the ABI for Guarded Control Stacks") Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241031-arm64-gcs-doc-disable-v1-1-d7f6ded62046@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/gcs.rst | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Documentation/arch/arm64/gcs.rst b/Documentation/arch/arm64/gcs.rst index af58d9151cb7..1f65a3193e77 100644 --- a/Documentation/arch/arm64/gcs.rst +++ b/Documentation/arch/arm64/gcs.rst @@ -204,11 +204,8 @@ When returning from a signal handler: * A new regset NT_ARM_GCS is defined for use with PTRACE_GETREGSET and PTRACE_SETREGSET. -* Due to the complexity surrounding allocation and deallocation of stacks and - lack of practical application it is not possible to enable GCS via ptrace. - GCS may be disabled via the ptrace interface. - -* Other GCS modes may be configured via ptrace. +* The GCS mode, including enable and disable, may be configured via ptrace. + If GCS is enabled via ptrace no new GCS will be allocated for the thread. * Configuration via ptrace ignores locking of GCS mode bits. From 2287a4c1e11822d05a70d22f28b26bd810dd204e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 31 Oct 2024 08:35:19 +0000 Subject: [PATCH 120/168] arm64: Expose ID_AA64ISAR1_EL1.XS to sanitised feature consumers Despite KVM now being able to deal with XS-tagged TLBIs, we still don't expose these feature bits to KVM. Plumb in the feature in ID_AA64ISAR1_EL1. Fixes: 0feec7769a63 ("KVM: arm64: nv: Add handling of NXS-flavoured TLBI operations") Signed-off-by: Marc Zyngier Acked-by: Catalin Marinas Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20241031083519.364313-1-maz@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpufeature.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 718728a85430..db994d1fd97e 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -228,6 +228,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_XS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_I8MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DGH_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_BF16_SHIFT, 4, 0), From 69c0d824779843b51ca2339b2163db4d3b40c54c Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 28 Oct 2024 20:22:31 +0000 Subject: [PATCH 121/168] kselftest/arm64: Fix encoding for SVE B16B16 test The test for SVE_B16B16 had a cut'n'paste of a SME instruction, fix it with a relevant SVE instruction. 
Fixes: 44d10c27bd75 ("kselftest/arm64: Add 2023 DPISA hwcap test coverage") Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241028-arm64-b16b16-test-v1-1-59a4a7449bdf@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/abi/hwcap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index 7e95ba5fd496..265654ec48b9 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -361,8 +361,8 @@ static void sveaes_sigill(void) static void sveb16b16_sigill(void) { - /* BFADD ZA.H[W0, 0], {Z0.H-Z1.H} */ - asm volatile(".inst 0xC1E41C00" : : : ); + /* BFADD Z0.H, Z0.H, Z0.H */ + asm volatile(".inst 0x65000000" : : : ); } static void svepmull_sigill(void) From 525fd6a1b34ea8ce42e12db38380eed0efefb5d5 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 2 Nov 2024 10:31:54 +0100 Subject: [PATCH 122/168] arm64/fpsimd: Fix a typo s/FPSMID/FPSIMD/ M and I swapped. Fix it. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/2cbcb42615e9265bccc9b746465d7998382e605d.1730539907.git.christophe.jaillet@wanadoo.fr Signed-off-by: Catalin Marinas --- arch/arm64/kernel/fpsimd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 77006df20a75..cd7d71fe1fda 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -386,7 +386,7 @@ static void task_fpsimd_load(void) * fpsimd_save_user_state() or memory corruption, we * should always record an explicit format * when we save. We always at least have the - * memory allocated for FPSMID registers so + * memory allocated for FPSIMD registers so * try that and hope for the best. */ WARN_ON_ONCE(1); From 263e22d6bd1f8a12dc2e770cf190f8dfb31f867e Mon Sep 17 00:00:00 2001 From: Zheng Zengkai Date: Wed, 16 Oct 2024 17:54:58 +0800 Subject: [PATCH 123/168] ACPI: GTDT: Tighten the check for the array of platform timer structures As suggested by Marc and Lorenzo, first we need to check whether the platform_timer entry pointer is within gtdt bounds (< gtdt_end) before de-referencing what it points at to detect the length of the platform timer struct and then check that the length of current platform_timer struct is also valid, i.e. the length is not zero and within gtdt_end. Now next_platform_timer() only checks against gtdt_end for the entry of subsequent platform timer without checking the length of it and will not report error if the check failed and the existing check in function acpi_gtdt_init() is also not enough. Modify the for_each_platform_timer() iterator and use it combined with a dedicated check function platform_timer_valid() to do the check against table length (gtdt_end) for each element of platform timer array in function acpi_gtdt_init(), making sure that both their entry and length actually fit in the table. 
Suggested-by: Lorenzo Pieralisi Co-developed-by: Marc Zyngier Signed-off-by: Marc Zyngier Signed-off-by: Zheng Zengkai Reviewed-by: Lorenzo Pieralisi Reviewed-by: Hanjun Guo Tested-by: Hanjun Guo Link: https://lore.kernel.org/r/20241016095458.34126-1-zhengzengkai@huawei.com Signed-off-by: Catalin Marinas --- drivers/acpi/arm64/gtdt.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c index c0e77c1c8e09..d7c4e1b9915b 100644 --- a/drivers/acpi/arm64/gtdt.c +++ b/drivers/acpi/arm64/gtdt.c @@ -36,19 +36,25 @@ struct acpi_gtdt_descriptor { static struct acpi_gtdt_descriptor acpi_gtdt_desc __initdata; -static inline __init void *next_platform_timer(void *platform_timer) +static __init bool platform_timer_valid(void *platform_timer) { struct acpi_gtdt_header *gh = platform_timer; - platform_timer += gh->length; - if (platform_timer < acpi_gtdt_desc.gtdt_end) - return platform_timer; + return (platform_timer >= (void *)(acpi_gtdt_desc.gtdt + 1) && + platform_timer < acpi_gtdt_desc.gtdt_end && + gh->length != 0 && + platform_timer + gh->length <= acpi_gtdt_desc.gtdt_end); +} - return NULL; +static __init void *next_platform_timer(void *platform_timer) +{ + struct acpi_gtdt_header *gh = platform_timer; + + return platform_timer + gh->length; } #define for_each_platform_timer(_g) \ - for (_g = acpi_gtdt_desc.platform_timer; _g; \ + for (_g = acpi_gtdt_desc.platform_timer; platform_timer_valid(_g);\ _g = next_platform_timer(_g)) static inline bool is_timer_block(void *platform_timer) @@ -157,6 +163,7 @@ int __init acpi_gtdt_init(struct acpi_table_header *table, { void *platform_timer; struct acpi_table_gtdt *gtdt; + int cnt = 0; gtdt = container_of(table, struct acpi_table_gtdt, header); acpi_gtdt_desc.gtdt = gtdt; @@ -176,12 +183,16 @@ int __init acpi_gtdt_init(struct acpi_table_header *table, return 0; } - platform_timer = (void *)gtdt + gtdt->platform_timer_offset; - if (platform_timer < (void *)table + sizeof(struct acpi_table_gtdt)) { + acpi_gtdt_desc.platform_timer = (void *)gtdt + gtdt->platform_timer_offset; + for_each_platform_timer(platform_timer) + cnt++; + + if (cnt != gtdt->platform_timer_count) { + acpi_gtdt_desc.platform_timer = NULL; pr_err(FW_BUG "invalid timer data.\n"); return -EINVAL; } - acpi_gtdt_desc.platform_timer = platform_timer; + if (platform_timer_count) *platform_timer_count = gtdt->platform_timer_count; From ced841702ee7013ef08159a2cb5cadf0e0820b13 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 4 Nov 2024 09:46:17 +0530 Subject: [PATCH 124/168] arm64/mm: Drop setting PTE_TYPE_PAGE in pte_mkcont() PTE_TYPE_PAGE bits were being set in pte_mkcont() because PTE_TABLE_BIT was being cleared in pte_mkhuge(). But after arch_make_huge_pte() modification in commit f8192813dcbe ("arm64/mm: Re-organize arch_make_huge_pte()"), which dropped pte_mkhuge() completely, setting back PTE_TYPE_PAGE bits is no longer necessary. Change pte_mkcont() to only set PTE_CONT. 
Cc: Will Deacon Cc: Ard Biesheuvel Cc: Ryan Roberts Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20241104041617.3804617-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 609fd4447b19..0569b2308fd1 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -265,8 +265,7 @@ static inline pte_t pte_mkspecial(pte_t pte) static inline pte_t pte_mkcont(pte_t pte) { - pte = set_pte_bit(pte, __pgprot(PTE_CONT)); - return set_pte_bit(pte, __pgprot(PTE_TYPE_PAGE)); + return set_pte_bit(pte, __pgprot(PTE_CONT)); } static inline pte_t pte_mknoncont(pte_t pte) From 466ece4c6e195263ccab2d402ee06b2f8d639a0e Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 29 Oct 2024 14:45:36 +0000 Subject: [PATCH 125/168] arm64: signal: Remove unnecessary check when saving POE state The POE frame record is allocated unconditionally if POE is supported. If the allocation fails, a SIGSEGV is delivered before setup_sigframe() can be reached. As a result there is no need to consider poe_offset before saving POR_EL0; just remove that check. This is in line with other frame records (FPMR, TPIDR2). Reviewed-by: Mark Brown Reviewed-by: Dave Martin Acked-by: Catalin Marinas Signed-off-by: Kevin Brodsky Link: https://lore.kernel.org/r/20241029144539.111155-3-kevin.brodsky@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index c7d311d8b92a..d5eb517cc4df 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -1154,7 +1154,7 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, err |= preserve_fpmr_context(fpmr_ctx); } - if (system_supports_poe() && err == 0 && user->poe_offset) { + if (system_supports_poe() && err == 0) { struct poe_context __user *poe_ctx = apply_user_offset(user, user->poe_offset); From 8edbbfcc1ed3ca2170a2c5e888a76ba3652e62c9 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 29 Oct 2024 14:45:37 +0000 Subject: [PATCH 126/168] arm64: signal: Remove unused macro Commit 33f082614c34 ("arm64: signal: Allow expansion of the signal frame") introduced the BASE_SIGFRAME_SIZE macro but it has apparently never been used; just remove it. Reviewed-by: Dave Martin Acked-by: Catalin Marinas Signed-off-by: Kevin Brodsky Link: https://lore.kernel.org/r/20241029144539.111155-4-kevin.brodsky@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/signal.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index d5eb517cc4df..b077181a66c0 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -79,7 +79,6 @@ struct user_access_state { u64 por_el0; }; -#define BASE_SIGFRAME_SIZE round_up(sizeof(struct rt_sigframe), 16) #define TERMINATOR_SIZE round_up(sizeof(struct _aarch64_ctx), 16) #define EXTRA_CONTEXT_SIZE round_up(sizeof(struct extra_context), 16) From 6e182dc9f2680681ffb0b6d9757927f1bd321b38 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 29 Oct 2024 14:45:38 +0000 Subject: [PATCH 127/168] selftests/mm: Use generic pkey register manipulation pkey_sighandler_tests.c currently hardcodes x86 PKRU encodings. 
The first step towards running those tests on arm64 is to abstract away the pkey register values. Since those tests want to deny access to all keys except a few, we have each arch define PKEY_REG_ALLOW_NONE, the pkey register value denying access to all keys. We then use the existing set_pkey_bits() helper to grant access to specific keys. Because pkeys may also remove the execute permission on arm64, we need to be a little careful: all code is mapped with pkey 0, and we need it to remain executable. pkey_reg_restrictive_default() is introduced for that purpose: the value it returns prevents RW access to all pkeys, but retains X permission for pkey 0. test_pkru_preserved_after_sigusr1() only checks that the pkey register value remains unchanged after a signal is delivered, so the particular value is irrelevant. We enable pkey 0 and a few more arbitrary keys in the smallest range available on all architectures (8 keys on arm64). Signed-off-by: Kevin Brodsky Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20241029144539.111155-5-kevin.brodsky@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/mm/pkey-arm64.h | 1 + tools/testing/selftests/mm/pkey-x86.h | 2 + .../selftests/mm/pkey_sighandler_tests.c | 53 +++++++++++++++---- 3 files changed, 47 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/mm/pkey-arm64.h b/tools/testing/selftests/mm/pkey-arm64.h index 580e1b0bb38e..d57fbeace38f 100644 --- a/tools/testing/selftests/mm/pkey-arm64.h +++ b/tools/testing/selftests/mm/pkey-arm64.h @@ -31,6 +31,7 @@ #define NR_RESERVED_PKEYS 1 /* pkey-0 */ #define PKEY_ALLOW_ALL 0x77777777 +#define PKEY_REG_ALLOW_NONE 0x0 #define PKEY_BITS_PER_PKEY 4 #define PAGE_SIZE sysconf(_SC_PAGESIZE) diff --git a/tools/testing/selftests/mm/pkey-x86.h b/tools/testing/selftests/mm/pkey-x86.h index 5f28e26a2511..ac91777c8917 100644 --- a/tools/testing/selftests/mm/pkey-x86.h +++ b/tools/testing/selftests/mm/pkey-x86.h @@ -34,6 +34,8 @@ #define PAGE_SIZE 4096 #define MB (1<<20) +#define PKEY_REG_ALLOW_NONE 0x55555555 + static inline void __page_o_noops(void) { /* 8-bytes of instruction * 512 bytes = 1 page */ diff --git a/tools/testing/selftests/mm/pkey_sighandler_tests.c b/tools/testing/selftests/mm/pkey_sighandler_tests.c index a8088b645ad6..501880dbdc37 100644 --- a/tools/testing/selftests/mm/pkey_sighandler_tests.c +++ b/tools/testing/selftests/mm/pkey_sighandler_tests.c @@ -11,6 +11,7 @@ */ #define _GNU_SOURCE #define __SANE_USERSPACE_TYPES__ +#include #include #include #include @@ -65,6 +66,20 @@ long syscall_raw(long n, long a1, long a2, long a3, long a4, long a5, long a6) return ret; } +/* + * Returns the most restrictive pkey register value that can be used by the + * tests. + */ +static inline u64 pkey_reg_restrictive_default(void) +{ + /* + * Disallow everything except execution on pkey 0, so that each caller + * doesn't need to enable it explicitly (the selftest code runs with + * its code mapped with pkey 0). 
+ */ + return set_pkey_bits(PKEY_REG_ALLOW_NONE, 0, PKEY_DISABLE_ACCESS); +} + static void sigsegv_handler(int signo, siginfo_t *info, void *ucontext) { pthread_mutex_lock(&mutex); @@ -113,7 +128,7 @@ static void raise_sigusr2(void) static void *thread_segv_with_pkey0_disabled(void *ptr) { /* Disable MPK 0 (and all others too) */ - __write_pkey_reg(0x55555555); + __write_pkey_reg(pkey_reg_restrictive_default()); /* Segfault (with SEGV_MAPERR) */ *(int *) (0x1) = 1; @@ -123,7 +138,7 @@ static void *thread_segv_with_pkey0_disabled(void *ptr) static void *thread_segv_pkuerr_stack(void *ptr) { /* Disable MPK 0 (and all others too) */ - __write_pkey_reg(0x55555555); + __write_pkey_reg(pkey_reg_restrictive_default()); /* After we disable MPK 0, we can't access the stack to return */ return NULL; @@ -133,6 +148,7 @@ static void *thread_segv_maperr_ptr(void *ptr) { stack_t *stack = ptr; int *bad = (int *)1; + u64 pkey_reg; /* * Setup alternate signal stack, which should be pkey_mprotect()ed by @@ -142,7 +158,9 @@ static void *thread_segv_maperr_ptr(void *ptr) syscall_raw(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0); /* Disable MPK 0. Only MPK 1 is enabled. */ - __write_pkey_reg(0x55555551); + pkey_reg = pkey_reg_restrictive_default(); + pkey_reg = set_pkey_bits(pkey_reg, 1, PKEY_UNRESTRICTED); + __write_pkey_reg(pkey_reg); /* Segfault */ *bad = 1; @@ -240,6 +258,7 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void) int pkey; int parent_pid = 0; int child_pid = 0; + u64 pkey_reg; sa.sa_flags = SA_SIGINFO | SA_ONSTACK; @@ -257,7 +276,10 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void) assert(stack != MAP_FAILED); /* Allow access to MPK 0 and MPK 1 */ - __write_pkey_reg(0x55555550); + pkey_reg = pkey_reg_restrictive_default(); + pkey_reg = set_pkey_bits(pkey_reg, 0, PKEY_UNRESTRICTED); + pkey_reg = set_pkey_bits(pkey_reg, 1, PKEY_UNRESTRICTED); + __write_pkey_reg(pkey_reg); /* Protect the new stack with MPK 1 */ pkey = pkey_alloc(0, 0); @@ -307,7 +329,13 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void) static void test_pkru_preserved_after_sigusr1(void) { struct sigaction sa; - unsigned long pkru = 0x45454544; + u64 pkey_reg; + + /* Allow access to MPK 0 and an arbitrary set of keys */ + pkey_reg = pkey_reg_restrictive_default(); + pkey_reg = set_pkey_bits(pkey_reg, 0, PKEY_UNRESTRICTED); + pkey_reg = set_pkey_bits(pkey_reg, 3, PKEY_UNRESTRICTED); + pkey_reg = set_pkey_bits(pkey_reg, 7, PKEY_UNRESTRICTED); sa.sa_flags = SA_SIGINFO; @@ -320,7 +348,7 @@ static void test_pkru_preserved_after_sigusr1(void) memset(&siginfo, 0, sizeof(siginfo)); - __write_pkey_reg(pkru); + __write_pkey_reg(pkey_reg); raise(SIGUSR1); @@ -330,7 +358,7 @@ static void test_pkru_preserved_after_sigusr1(void) pthread_mutex_unlock(&mutex); /* Ensure the pkru value is the same after returning from signal. */ - ksft_test_result(pkru == __read_pkey_reg() && + ksft_test_result(pkey_reg == __read_pkey_reg() && siginfo.si_signo == SIGUSR1, "%s\n", __func__); } @@ -347,6 +375,7 @@ static noinline void *thread_sigusr2_self(void *ptr) 'S', 'I', 'G', 'U', 'S', 'R', '2', '.', '.', '.', '\n', '\0'}; stack_t *stack = ptr; + u64 pkey_reg; /* * Setup alternate signal stack, which should be pkey_mprotect()ed by @@ -356,7 +385,9 @@ static noinline void *thread_sigusr2_self(void *ptr) syscall(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0); /* Disable MPK 0. Only MPK 2 is enabled. 
*/ - __write_pkey_reg(0x55555545); + pkey_reg = pkey_reg_restrictive_default(); + pkey_reg = set_pkey_bits(pkey_reg, 2, PKEY_UNRESTRICTED); + __write_pkey_reg(pkey_reg); raise_sigusr2(); @@ -384,6 +415,7 @@ static void test_pkru_sigreturn(void) int pkey; int parent_pid = 0; int child_pid = 0; + u64 pkey_reg; sa.sa_handler = SIG_DFL; sa.sa_flags = 0; @@ -418,7 +450,10 @@ static void test_pkru_sigreturn(void) * the current thread's stack is protected by the default MPK 0. Hence * both need to be enabled. */ - __write_pkey_reg(0x55555544); + pkey_reg = pkey_reg_restrictive_default(); + pkey_reg = set_pkey_bits(pkey_reg, 0, PKEY_UNRESTRICTED); + pkey_reg = set_pkey_bits(pkey_reg, 2, PKEY_UNRESTRICTED); + __write_pkey_reg(pkey_reg); /* Protect the stack with MPK 2 */ pkey = pkey_alloc(0, 0); From 49f59573e9e06093ba23caf4ea1641b16e7e497e Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 29 Oct 2024 14:45:39 +0000 Subject: [PATCH 128/168] selftests/mm: Enable pkey_sighandler_tests on arm64 pkey_sighandler_tests.c makes raw syscalls using its own helper, syscall_raw(). One of those syscalls is clone, which is problematic as every architecture has a different opinion on the order of its arguments. To complete arm64 support, we therefore add an appropriate implementation in syscall_raw(), and introduce a clone_raw() helper that shuffles arguments as needed for each arch. Having done this, we enable building pkey_sighandler_tests for arm64 in the Makefile. Signed-off-by: Kevin Brodsky Link: https://lore.kernel.org/r/20241029144539.111155-6-kevin.brodsky@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/mm/Makefile | 8 +-- .../selftests/mm/pkey_sighandler_tests.c | 62 ++++++++++++++----- 2 files changed, 50 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 02e1204971b0..0f8c110e0805 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -105,12 +105,12 @@ endif ifeq ($(CAN_BUILD_X86_64),1) TEST_GEN_FILES += $(BINARIES_64) endif -else -ifneq (,$(filter $(ARCH),arm64 powerpc)) +else ifeq ($(ARCH),arm64) +TEST_GEN_FILES += protection_keys +TEST_GEN_FILES += pkey_sighandler_tests +else ifeq ($(ARCH),powerpc) TEST_GEN_FILES += protection_keys -endif - endif ifneq (,$(filter $(ARCH),arm64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390)) diff --git a/tools/testing/selftests/mm/pkey_sighandler_tests.c b/tools/testing/selftests/mm/pkey_sighandler_tests.c index 501880dbdc37..c593a426341c 100644 --- a/tools/testing/selftests/mm/pkey_sighandler_tests.c +++ b/tools/testing/selftests/mm/pkey_sighandler_tests.c @@ -60,12 +60,44 @@ long syscall_raw(long n, long a1, long a2, long a3, long a4, long a5, long a6) : "=a"(ret) : "a"(n), "b"(a1), "c"(a2), "d"(a3), "S"(a4), "D"(a5) : "memory"); +#elif defined __aarch64__ + register long x0 asm("x0") = a1; + register long x1 asm("x1") = a2; + register long x2 asm("x2") = a3; + register long x3 asm("x3") = a4; + register long x4 asm("x4") = a5; + register long x5 asm("x5") = a6; + register long x8 asm("x8") = n; + asm volatile ("svc #0" + : "=r"(x0) + : "r"(x0), "r"(x1), "r"(x2), "r"(x3), "r"(x4), "r"(x5), "r"(x8) + : "memory"); + ret = x0; #else # error syscall_raw() not implemented #endif return ret; } +static inline long clone_raw(unsigned long flags, void *stack, + int *parent_tid, int *child_tid) +{ + long a1 = flags; + long a2 = (long)stack; + long a3 = (long)parent_tid; +#if defined(__x86_64__) || defined(__i386) + long a4 = 
(long)child_tid; + long a5 = 0; +#elif defined(__aarch64__) + long a4 = 0; + long a5 = (long)child_tid; +#else +# error clone_raw() not implemented +#endif + + return syscall_raw(SYS_clone, a1, a2, a3, a4, a5, 0); +} + /* * Returns the most restrictive pkey register value that can be used by the * tests. @@ -294,14 +326,13 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void) memset(&siginfo, 0, sizeof(siginfo)); /* Use clone to avoid newer glibcs using rseq on new threads */ - long ret = syscall_raw(SYS_clone, - CLONE_VM | CLONE_FS | CLONE_FILES | - CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | - CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | - CLONE_DETACHED, - (long) ((char *)(stack) + STACK_SIZE), - (long) &parent_pid, - (long) &child_pid, 0, 0); + long ret = clone_raw(CLONE_VM | CLONE_FS | CLONE_FILES | + CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | + CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | + CLONE_DETACHED, + stack + STACK_SIZE, + &parent_pid, + &child_pid); if (ret < 0) { errno = -ret; @@ -466,14 +497,13 @@ static void test_pkru_sigreturn(void) sigstack.ss_size = STACK_SIZE; /* Use clone to avoid newer glibcs using rseq on new threads */ - long ret = syscall_raw(SYS_clone, - CLONE_VM | CLONE_FS | CLONE_FILES | - CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | - CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | - CLONE_DETACHED, - (long) ((char *)(stack) + STACK_SIZE), - (long) &parent_pid, - (long) &child_pid, 0, 0); + long ret = clone_raw(CLONE_VM | CLONE_FS | CLONE_FILES | + CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | + CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | + CLONE_DETACHED, + stack + STACK_SIZE, + &parent_pid, + &child_pid); if (ret < 0) { errno = -ret; From aa47dcda2708e571695dae2e3f9537d9a8eb804c Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Sat, 2 Nov 2024 18:42:31 +0800 Subject: [PATCH 129/168] arm64/sysreg: Update ID_AA64MMFR1_EL1 register Update ID_AA64MMFR1_EL1 register fields definition per DDI0601 (ID092424) 2024-09. ID_AA64MMFR1_EL1.ETS adds definition for FEAT_ETS2 and FEAT_ETS3. ID_AA64MMFR1_EL1.HAFDBS adds definition for FEAT_HAFT and FEAT_HDBSS. Reviewed-by: Mark Brown Signed-off-by: Yicong Yang Link: https://lore.kernel.org/r/20241102104235.62560-2-yangyicong@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/tools/sysreg | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 8d637ac4b7c6..ae64ba810298 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1648,6 +1648,8 @@ EndEnum UnsignedEnum 39:36 ETS 0b0000 NI 0b0001 IMP + 0b0010 ETS2 + 0b0011 ETS3 EndEnum UnsignedEnum 35:32 TWED 0b0000 NI @@ -1688,6 +1690,8 @@ UnsignedEnum 3:0 HAFDBS 0b0000 NI 0b0001 AF 0b0010 DBM + 0b0011 HAFT + 0b0100 HDBSS EndEnum EndSysreg From 926b66e2ebc8c055b9fea3fb3e5f5b67c80e8e7a Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Sat, 2 Nov 2024 18:42:32 +0800 Subject: [PATCH 130/168] arm64: setup: name 'tcr2' register TCR2_EL1 introduced some additional controls besides TCR_EL1. Currently only PIE is supported and enabled by writing TCR2_EL1 directly if PIE detected. Introduce a named register 'tcr2' just like 'tcr' we've already had. It'll be initialized to 0 and updated if certain feature detected and needs to be enabled. Touch the TCR2_EL1 registers at last with the updated 'tcr2' value if FEAT_TCR2 supported by checking ID_AA64MMFR3_EL1.TCRX. Then we can extend the support of other features controlled by TCR2_EL1. 
Reviewed-by: Catalin Marinas Signed-off-by: Yicong Yang Link: https://lore.kernel.org/r/20241102104235.62560-3-yangyicong@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/mm/proc.S | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 8abdc7fed321..ccbae4525891 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -465,10 +465,12 @@ SYM_FUNC_START(__cpu_setup) */ mair .req x17 tcr .req x16 + tcr2 .req x15 mov_q mair, MAIR_EL1_SET mov_q tcr, TCR_T0SZ(IDMAP_VA_BITS) | TCR_T1SZ(VA_BITS_MIN) | TCR_CACHE_FLAGS | \ TCR_SHARED | TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \ TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS + mov tcr2, xzr tcr_clear_errata_bits tcr, x9, x5 @@ -525,11 +527,16 @@ alternative_else_nop_endif #undef PTE_MAYBE_NG #undef PTE_MAYBE_SHARED - mov x0, TCR2_EL1x_PIE - msr REG_TCR2_EL1, x0 + orr tcr2, tcr2, TCR2_EL1x_PIE .Lskip_indirection: + mrs_s x1, SYS_ID_AA64MMFR3_EL1 + ubfx x1, x1, #ID_AA64MMFR3_EL1_TCRX_SHIFT, #4 + cbz x1, 1f + msr REG_TCR2_EL1, tcr2 +1: + /* * Prepare SCTLR */ @@ -538,4 +545,5 @@ alternative_else_nop_endif .unreq mair .unreq tcr + .unreq tcr2 SYM_FUNC_END(__cpu_setup) From baec2397971960e8d5661c616ce20d6a24d7dc4f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Nov 2024 10:39:20 +0100 Subject: [PATCH 131/168] arm64/mm: Sanity check PTE address before runtime P4D/PUD folding The runtime P4D/PUD folding logic assumes that the respective pgd_t* and p4d_t* arguments are pointers into actual page tables that are part of the hierarchy being operated on. This may not always be the case, and we have been bitten once by this already [0], where the argument was actually a stack variable, and in this case, the logic does not work at all. So let's add a VM_BUG_ON() for each case, to ensure that the address of the provided page table entry is consistent with the address being translated. [0] https://lore.kernel.org/all/20240725090345.28461-1-will@kernel.org/T/#u Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20241105093919.1312049-2-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 0569b2308fd1..073e25f73445 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -921,6 +921,9 @@ static inline phys_addr_t p4d_page_paddr(p4d_t p4d) static inline pud_t *p4d_to_folded_pud(p4d_t *p4dp, unsigned long addr) { + /* Ensure that 'p4dp' indexes a page table according to 'addr' */ + VM_BUG_ON(((addr >> P4D_SHIFT) ^ ((u64)p4dp >> 3)) % PTRS_PER_P4D); + return (pud_t *)PTR_ALIGN_DOWN(p4dp, PAGE_SIZE) + pud_index(addr); } @@ -1045,6 +1048,9 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) static inline p4d_t *pgd_to_folded_p4d(pgd_t *pgdp, unsigned long addr) { + /* Ensure that 'pgdp' indexes a page table according to 'addr' */ + VM_BUG_ON(((addr >> PGDIR_SHIFT) ^ ((u64)pgdp >> 3)) % PTRS_PER_PGD); + return (p4d_t *)PTR_ALIGN_DOWN(pgdp, PAGE_SIZE) + p4d_index(addr); } From dc9b74a76320dd87335956f27f09791fa922ea9b Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 5 Nov 2024 10:11:54 +0530 Subject: [PATCH 132/168] arm64/ptdump: Test both PTE_TABLE_BIT and PTE_VALID for block mappings Test both PTE_TABLE_BIT and PTE_VALID for block mappings, similar to KVM S2 ptdump. 
This ensures consistency in identifying block mappings, both in the S1 and the S2 page tables. Besides being kernel page tables, there will not be any unmapped (!PTE_VALID) block mappings. Cc: Will Deacon Cc: Ard Biesheuvel Cc: Ryan Roberts Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20241105044154.4064181-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/mm/ptdump.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 264c5f9b97d8..688fbe0271ca 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -80,10 +80,10 @@ static const struct ptdump_prot_bits pte_bits[] = { .set = "CON", .clear = " ", }, { - .mask = PTE_TABLE_BIT, - .val = PTE_TABLE_BIT, - .set = " ", - .clear = "BLK", + .mask = PTE_TABLE_BIT | PTE_VALID, + .val = PTE_VALID, + .set = "BLK", + .clear = " ", }, { .mask = PTE_UXN, .val = PTE_UXN, From efe72541355d4d40a4f076af453f6533e98e058c Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Sat, 2 Nov 2024 18:42:33 +0800 Subject: [PATCH 133/168] arm64: Add support for FEAT_HAFT Armv8.9/v9.4 introduces the feature Hardware managed Access Flag for Table descriptors (FEAT_HAFT). The feature is indicated by ID_AA64MMFR1_EL1.HAFDBS == 0b0011 and can be enabled by TCR2_EL1.HAFT so it has a dependency on FEAT_TCR2. Adds the Kconfig for FEAT_HAFT and support detecting and enabling the feature. The feature is enabled in __cpu_setup() before MMU on just like HA. A CPU capability is added to notify the user of the feature. Add definition of P{G,4,U,M}D_TABLE_AF bit and set the AF bit when creating the page table, which will save the hardware from having to update them at runtime. This will be ignored if FEAT_HAFT is not enabled. The AF bit of table descriptors cannot be managed by the software per spec, unlike the HA. So this should be used only if it's supported system wide by system_supports_haft(). Signed-off-by: Yicong Yang Link: https://lore.kernel.org/r/20241102104235.62560-4-yangyicong@huawei.com Reviewed-by: Catalin Marinas [catalin.marinas@arm.com: added the ID check back to __cpu_setup in case of future CPU errata] Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 15 +++++++++++++++ arch/arm64/include/asm/cpufeature.h | 6 ++++++ arch/arm64/include/asm/pgalloc.h | 12 +++++++----- arch/arm64/include/asm/pgtable-hwdef.h | 4 ++++ arch/arm64/kernel/cpufeature.c | 15 +++++++++++++++ arch/arm64/mm/fixmap.c | 9 ++++++--- arch/arm64/mm/mmu.c | 8 ++++---- arch/arm64/mm/proc.S | 7 ++++++- arch/arm64/tools/cpucaps | 1 + 9 files changed, 64 insertions(+), 13 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3e29b44d2d7b..b90f92f89d54 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2176,6 +2176,21 @@ config ARCH_PKEY_BITS int default 3 +config ARM64_HAFT + bool "Support for Hardware managed Access Flag for Table Descriptors" + depends on ARM64_HW_AFDBM + default y + help + The ARMv8.9/ARMv9.5 introduces the feature Hardware managed Access + Flag for Table descriptors. When enabled an architectural executed + memory access will update the Access Flag in each Table descriptor + which is accessed during the translation table walk and for which + the Access Flag is 0. The Access Flag of the Table descriptor use + the same bit of PTE_AF. + + The feature will only be enabled if all the CPUs in the system + support this feature. 
If unsure, say Y. + endmenu # "ARMv8.9 architectural features" config ARM64_SVE diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 3d261cc123c1..ed8c784ca082 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -838,6 +838,12 @@ static inline bool system_supports_poe(void) alternative_has_cap_unlikely(ARM64_HAS_S1POE); } +static inline bool system_supports_haft(void) +{ + return IS_ENABLED(CONFIG_ARM64_HAFT) && + cpus_have_final_cap(ARM64_HAFT); +} + int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); bool try_emulate_mrs(struct pt_regs *regs, u32 isn); diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index 8ff5f2a2579e..e75422864d1b 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -28,7 +28,7 @@ static inline void __pud_populate(pud_t *pudp, phys_addr_t pmdp, pudval_t prot) static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmdp) { - pudval_t pudval = PUD_TYPE_TABLE; + pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_AF; pudval |= (mm == &init_mm) ? PUD_TABLE_UXN : PUD_TABLE_PXN; __pud_populate(pudp, __pa(pmdp), pudval); @@ -50,7 +50,7 @@ static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot) static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp) { - p4dval_t p4dval = P4D_TYPE_TABLE; + p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_AF; p4dval |= (mm == &init_mm) ? P4D_TABLE_UXN : P4D_TABLE_PXN; __p4d_populate(p4dp, __pa(pudp), p4dval); @@ -79,7 +79,7 @@ static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t p4dp, pgdval_t prot) static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, p4d_t *p4dp) { - pgdval_t pgdval = PGD_TYPE_TABLE; + pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_AF; pgdval |= (mm == &init_mm) ? 
PGD_TABLE_UXN : PGD_TABLE_PXN; __pgd_populate(pgdp, __pa(p4dp), pgdval); @@ -127,14 +127,16 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep) { VM_BUG_ON(mm && mm != &init_mm); - __pmd_populate(pmdp, __pa(ptep), PMD_TYPE_TABLE | PMD_TABLE_UXN); + __pmd_populate(pmdp, __pa(ptep), + PMD_TYPE_TABLE | PMD_TABLE_AF | PMD_TABLE_UXN); } static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep) { VM_BUG_ON(mm == &init_mm); - __pmd_populate(pmdp, page_to_phys(ptep), PMD_TYPE_TABLE | PMD_TABLE_PXN); + __pmd_populate(pmdp, page_to_phys(ptep), + PMD_TYPE_TABLE | PMD_TABLE_AF | PMD_TABLE_PXN); } #endif diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index fd330c1db289..c78a988cca93 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -99,6 +99,7 @@ #define PGD_TYPE_TABLE (_AT(pgdval_t, 3) << 0) #define PGD_TABLE_BIT (_AT(pgdval_t, 1) << 1) #define PGD_TYPE_MASK (_AT(pgdval_t, 3) << 0) +#define PGD_TABLE_AF (_AT(pgdval_t, 1) << 10) /* Ignored if no FEAT_HAFT */ #define PGD_TABLE_PXN (_AT(pgdval_t, 1) << 59) #define PGD_TABLE_UXN (_AT(pgdval_t, 1) << 60) @@ -110,6 +111,7 @@ #define P4D_TYPE_MASK (_AT(p4dval_t, 3) << 0) #define P4D_TYPE_SECT (_AT(p4dval_t, 1) << 0) #define P4D_SECT_RDONLY (_AT(p4dval_t, 1) << 7) /* AP[2] */ +#define P4D_TABLE_AF (_AT(p4dval_t, 1) << 10) /* Ignored if no FEAT_HAFT */ #define P4D_TABLE_PXN (_AT(p4dval_t, 1) << 59) #define P4D_TABLE_UXN (_AT(p4dval_t, 1) << 60) @@ -121,6 +123,7 @@ #define PUD_TYPE_MASK (_AT(pudval_t, 3) << 0) #define PUD_TYPE_SECT (_AT(pudval_t, 1) << 0) #define PUD_SECT_RDONLY (_AT(pudval_t, 1) << 7) /* AP[2] */ +#define PUD_TABLE_AF (_AT(pudval_t, 1) << 10) /* Ignored if no FEAT_HAFT */ #define PUD_TABLE_PXN (_AT(pudval_t, 1) << 59) #define PUD_TABLE_UXN (_AT(pudval_t, 1) << 60) @@ -131,6 +134,7 @@ #define PMD_TYPE_TABLE (_AT(pmdval_t, 3) << 0) #define PMD_TYPE_SECT (_AT(pmdval_t, 1) << 0) #define PMD_TABLE_BIT (_AT(pmdval_t, 1) << 1) +#define PMD_TABLE_AF (_AT(pmdval_t, 1) << 10) /* Ignored if no FEAT_HAFT */ /* * Section diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 718728a85430..878712fa0d3b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2590,6 +2590,21 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpus = &dbm_cpus, ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, DBM) }, +#endif +#ifdef CONFIG_ARM64_HAFT + { + .desc = "Hardware managed Access Flag for Table Descriptors", + /* + * Contrary to the page/block access flag, the table access flag + * cannot be emulated in software (no access fault will occur). + * Therefore this should be used only if it's supported system + * wide. 
+ */ + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAFT, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, HAFT) + }, #endif { .desc = "CRC32 instructions", diff --git a/arch/arm64/mm/fixmap.c b/arch/arm64/mm/fixmap.c index de1e09d986ad..c5c5425791da 100644 --- a/arch/arm64/mm/fixmap.c +++ b/arch/arm64/mm/fixmap.c @@ -47,7 +47,8 @@ static void __init early_fixmap_init_pte(pmd_t *pmdp, unsigned long addr) if (pmd_none(pmd)) { ptep = bm_pte[BM_PTE_TABLE_IDX(addr)]; - __pmd_populate(pmdp, __pa_symbol(ptep), PMD_TYPE_TABLE); + __pmd_populate(pmdp, __pa_symbol(ptep), + PMD_TYPE_TABLE | PMD_TABLE_AF); } } @@ -59,7 +60,8 @@ static void __init early_fixmap_init_pmd(pud_t *pudp, unsigned long addr, pmd_t *pmdp; if (pud_none(pud)) - __pud_populate(pudp, __pa_symbol(bm_pmd), PUD_TYPE_TABLE); + __pud_populate(pudp, __pa_symbol(bm_pmd), + PUD_TYPE_TABLE | PUD_TABLE_AF); pmdp = pmd_offset_kimg(pudp, addr); do { @@ -86,7 +88,8 @@ static void __init early_fixmap_init_pud(p4d_t *p4dp, unsigned long addr, } if (p4d_none(p4d)) - __p4d_populate(p4dp, __pa_symbol(bm_pud), P4D_TYPE_TABLE); + __p4d_populate(p4dp, __pa_symbol(bm_pud), + P4D_TYPE_TABLE | P4D_TABLE_AF); pudp = pud_offset_kimg(p4dp, addr); early_fixmap_init_pmd(pudp, addr, end); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index e55b02fbddc8..6441a45eaeda 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -201,7 +201,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, BUG_ON(pmd_sect(pmd)); if (pmd_none(pmd)) { - pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN; + pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF; phys_addr_t pte_phys; if (flags & NO_EXEC_MAPPINGS) @@ -288,7 +288,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, */ BUG_ON(pud_sect(pud)); if (pud_none(pud)) { - pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN; + pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF; phys_addr_t pmd_phys; if (flags & NO_EXEC_MAPPINGS) @@ -333,7 +333,7 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, pud_t *pudp; if (p4d_none(p4d)) { - p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN; + p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN | P4D_TABLE_AF; phys_addr_t pud_phys; if (flags & NO_EXEC_MAPPINGS) @@ -391,7 +391,7 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, p4d_t *p4dp; if (pgd_none(pgd)) { - pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_UXN; + pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_UXN | PGD_TABLE_AF; phys_addr_t p4d_phys; if (flags & NO_EXEC_MAPPINGS) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index ccbae4525891..b8edc5765441 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -495,9 +495,14 @@ alternative_else_nop_endif * via capabilities. 
*/ mrs x9, ID_AA64MMFR1_EL1 - and x9, x9, ID_AA64MMFR1_EL1_HAFDBS_MASK + ubfx x9, x9, ID_AA64MMFR1_EL1_HAFDBS_SHIFT, #4 cbz x9, 1f orr tcr, tcr, #TCR_HA // hardware Access flag update +#ifdef CONFIG_ARM64_HAFT + cmp x9, ID_AA64MMFR1_EL1_HAFDBS_HAFT + b.lt 1f + orr tcr2, tcr2, TCR2_EL1x_HAFT +#endif /* CONFIG_ARM64_HAFT */ 1: #endif /* CONFIG_ARM64_HW_AFDBM */ msr mair_el1, mair diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index eedb5acc21ed..b35004fa8313 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -56,6 +56,7 @@ HAS_TLB_RANGE HAS_VA52 HAS_VIRT_HOST_EXTN HAS_WFXT +HAFT HW_DBM KVM_HVHE KVM_PROTECTED_MODE From 62df5870ebf7cec96a51c9b9008daf167e22db14 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Sat, 2 Nov 2024 18:42:34 +0800 Subject: [PATCH 134/168] arm64: Enable ARCH_HAS_NONLEAF_PMD_YOUNG With the support of FEAT_HAFT, the NONLEAF_PMD_YOUNG can be enabled on arm64 since the hardware is capable of updating the AF flag for PMD table descriptor. Since the AF bit of the table descriptor shares the same bit position in block descriptors, we only need to implement arch_has_hw_nonleaf_pmd_young() and select related configs. The related pmd_young test/update operations keeps the same with and already implemented for transparent page support. Currently ARCH_HAS_NONLEAF_PMD_YOUNG is used to improve the efficiency of lru-gen aging. Signed-off-by: Yicong Yang Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20241102104235.62560-5-yangyicong@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/pgtable.h | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b90f92f89d54..d2ca7b6c6f85 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -38,6 +38,7 @@ config ARM64 select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + select ARCH_HAS_NONLEAF_PMD_YOUNG if ARM64_HAFT select ARCH_HAS_PTE_DEVMAP select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_HW_PTE_YOUNG diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index c329ea061dc9..5182feabd943 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1259,7 +1259,7 @@ static inline int __ptep_clear_flush_young(struct vm_area_struct *vma, return young; } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, @@ -1267,7 +1267,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, { return __ptep_test_and_clear_young(vma, address, (pte_t *)pmdp); } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ static inline pte_t __ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) @@ -1502,6 +1502,10 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf, */ #define arch_has_hw_pte_young cpu_has_hw_af +#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG +#define arch_has_hw_nonleaf_pmd_young system_supports_haft +#endif + /* * Experimentally, it's cheap to set the access flag in hardware and we * benefit from prefaulting mappings as 'old' to start with. 
From b349a5a2b6e236b25095c6ff886b3451de5ea041 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Sat, 2 Nov 2024 18:42:35 +0800 Subject: [PATCH 135/168] arm64: pgtable: Warn unexpected pmdp_test_and_clear_young() Young bit operation on PMD table entry is only supported if FEAT_HAFT enabled system wide. Add a warning for notifying the misbehaviour. Signed-off-by: Yicong Yang Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20241102104235.62560-6-yangyicong@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 5182feabd943..160ca503c99f 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1265,6 +1265,8 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { + /* Operation applies to PMD table entry only if FEAT_HAFT is enabled */ + VM_WARN_ON(pmd_table(READ_ONCE(*pmdp)) && !system_supports_haft()); return __ptep_test_and_clear_young(vma, address, (pte_t *)pmdp); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ From 845fd2cbedaf299ee61680a678b279dfeb6fe77c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sun, 27 Oct 2024 19:03:14 +0100 Subject: [PATCH 136/168] perf: Switch back to struct platform_driver::remove() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit 0edb555a65d1 ("platform: Make platform_driver::remove() return void") .remove() is (again) the right callback to implement for platform drivers. Convert all platform drivers below drivers/perf to use .remove(), with the eventual goal to drop struct platform_driver::remove_new(). As .remove() and .remove_new() have the same prototypes, conversion is done by just changing the structure member name in the driver initializer. 
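For illustration only (a hypothetical foo_pmu driver, not taken from any of the files touched below), the conversion looks like this; since commit 0edb555a65d1 both .remove() and .remove_new() take a struct platform_device * and return void, so only the member name in the initializer changes:

	static void foo_pmu_remove(struct platform_device *pdev)
	{
		/* device teardown, unchanged by this conversion */
	}

	static struct platform_driver foo_pmu_driver = {
		.driver = {
			.name = "foo-pmu",
		},
		.probe = foo_pmu_probe,
		/* previously: .remove_new = foo_pmu_remove, */
		.remove = foo_pmu_remove,
	};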
Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20241027180313.410964-2-u.kleine-koenig@baylibre.com Signed-off-by: Will Deacon --- drivers/perf/alibaba_uncore_drw_pmu.c | 2 +- drivers/perf/amlogic/meson_g12_ddr_pmu.c | 2 +- drivers/perf/arm-cci.c | 2 +- drivers/perf/arm-ccn.c | 2 +- drivers/perf/arm-cmn.c | 2 +- drivers/perf/arm_cspmu/arm_cspmu.c | 2 +- drivers/perf/arm_dmc620_pmu.c | 2 +- drivers/perf/arm_dsu_pmu.c | 2 +- drivers/perf/arm_smmuv3_pmu.c | 2 +- drivers/perf/arm_spe_pmu.c | 2 +- drivers/perf/fsl_imx8_ddr_perf.c | 2 +- drivers/perf/fsl_imx9_ddr_perf.c | 2 +- drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c | 2 +- drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c | 2 +- drivers/perf/hisilicon/hisi_uncore_hha_pmu.c | 2 +- drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c | 2 +- drivers/perf/hisilicon/hisi_uncore_pa_pmu.c | 2 +- drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c | 2 +- drivers/perf/marvell_cn10k_ddr_pmu.c | 2 +- drivers/perf/marvell_cn10k_tad_pmu.c | 2 +- drivers/perf/qcom_l2_pmu.c | 2 +- drivers/perf/thunderx2_pmu.c | 2 +- drivers/perf/xgene_pmu.c | 2 +- 23 files changed, 23 insertions(+), 23 deletions(-) diff --git a/drivers/perf/alibaba_uncore_drw_pmu.c b/drivers/perf/alibaba_uncore_drw_pmu.c index c6ff1bc7d336..99a0ef9817e0 100644 --- a/drivers/perf/alibaba_uncore_drw_pmu.c +++ b/drivers/perf/alibaba_uncore_drw_pmu.c @@ -782,7 +782,7 @@ static struct platform_driver ali_drw_pmu_driver = { .acpi_match_table = ali_drw_acpi_match, }, .probe = ali_drw_pmu_probe, - .remove_new = ali_drw_pmu_remove, + .remove = ali_drw_pmu_remove, }; static int __init ali_drw_pmu_init(void) diff --git a/drivers/perf/amlogic/meson_g12_ddr_pmu.c b/drivers/perf/amlogic/meson_g12_ddr_pmu.c index 99cc791892bc..f33e9a456e85 100644 --- a/drivers/perf/amlogic/meson_g12_ddr_pmu.c +++ b/drivers/perf/amlogic/meson_g12_ddr_pmu.c @@ -379,7 +379,7 @@ MODULE_DEVICE_TABLE(of, meson_ddr_pmu_dt_match); static struct platform_driver g12_ddr_pmu_driver = { .probe = g12_ddr_pmu_probe, - .remove_new = g12_ddr_pmu_remove, + .remove = g12_ddr_pmu_remove, .driver = { .name = "meson-g12-ddr-pmu", diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c index c76bac668dea..1cc3214d6b6d 100644 --- a/drivers/perf/arm-cci.c +++ b/drivers/perf/arm-cci.c @@ -1705,7 +1705,7 @@ static struct platform_driver cci_pmu_driver = { .suppress_bind_attrs = true, }, .probe = cci_pmu_probe, - .remove_new = cci_pmu_remove, + .remove = cci_pmu_remove, }; module_platform_driver(cci_pmu_driver); diff --git a/drivers/perf/arm-ccn.c b/drivers/perf/arm-ccn.c index 5c66b9278862..d5fcea3d4328 100644 --- a/drivers/perf/arm-ccn.c +++ b/drivers/perf/arm-ccn.c @@ -1529,7 +1529,7 @@ static struct platform_driver arm_ccn_driver = { .suppress_bind_attrs = true, }, .probe = arm_ccn_probe, - .remove_new = arm_ccn_remove, + .remove = arm_ccn_remove, }; static int __init arm_ccn_init(void) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 397a46410f7c..49bd811c6fd6 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -2662,7 +2662,7 @@ static struct platform_driver arm_cmn_driver = { .acpi_match_table = ACPI_PTR(arm_cmn_acpi_match), }, .probe = arm_cmn_probe, - .remove_new = arm_cmn_remove, + .remove = arm_cmn_remove, }; static int __init arm_cmn_init(void) diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c index 2158a5975c90..81e8b97e9353 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.c +++ b/drivers/perf/arm_cspmu/arm_cspmu.c @@ -1282,7 +1282,7 @@ static struct 
platform_driver arm_cspmu_driver = { .suppress_bind_attrs = true, }, .probe = arm_cspmu_device_probe, - .remove_new = arm_cspmu_device_remove, + .remove = arm_cspmu_device_remove, .id_table = arm_cspmu_id, }; diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c index 7e5f1d4fca0f..619cf937602f 100644 --- a/drivers/perf/arm_dmc620_pmu.c +++ b/drivers/perf/arm_dmc620_pmu.c @@ -750,7 +750,7 @@ static struct platform_driver dmc620_pmu_driver = { .suppress_bind_attrs = true, }, .probe = dmc620_pmu_device_probe, - .remove_new = dmc620_pmu_device_remove, + .remove = dmc620_pmu_device_remove, }; static int __init dmc620_pmu_init(void) diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c index f2bd25a3470a..cb4fb59fe04b 100644 --- a/drivers/perf/arm_dsu_pmu.c +++ b/drivers/perf/arm_dsu_pmu.c @@ -787,7 +787,7 @@ static struct platform_driver dsu_pmu_driver = { .suppress_bind_attrs = true, }, .probe = dsu_pmu_device_probe, - .remove_new = dsu_pmu_device_remove, + .remove = dsu_pmu_device_remove, }; static int dsu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node) diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c index d5fa92ba8373..b1510f660c7a 100644 --- a/drivers/perf/arm_smmuv3_pmu.c +++ b/drivers/perf/arm_smmuv3_pmu.c @@ -996,7 +996,7 @@ static struct platform_driver smmu_pmu_driver = { .suppress_bind_attrs = true, }, .probe = smmu_pmu_probe, - .remove_new = smmu_pmu_remove, + .remove = smmu_pmu_remove, .shutdown = smmu_pmu_shutdown, }; diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 3569050f9cf3..fd5b78732603 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -1280,7 +1280,7 @@ static struct platform_driver arm_spe_pmu_driver = { .suppress_bind_attrs = true, }, .probe = arm_spe_pmu_device_probe, - .remove_new = arm_spe_pmu_device_remove, + .remove = arm_spe_pmu_device_remove, }; static int __init arm_spe_pmu_init(void) diff --git a/drivers/perf/fsl_imx8_ddr_perf.c b/drivers/perf/fsl_imx8_ddr_perf.c index 746b92330ca7..b989ffa95d69 100644 --- a/drivers/perf/fsl_imx8_ddr_perf.c +++ b/drivers/perf/fsl_imx8_ddr_perf.c @@ -846,7 +846,7 @@ static struct platform_driver imx_ddr_pmu_driver = { .suppress_bind_attrs = true, }, .probe = ddr_perf_probe, - .remove_new = ddr_perf_remove, + .remove = ddr_perf_remove, }; module_platform_driver(imx_ddr_pmu_driver); diff --git a/drivers/perf/fsl_imx9_ddr_perf.c b/drivers/perf/fsl_imx9_ddr_perf.c index fe1a51f64751..3c856d9a4e97 100644 --- a/drivers/perf/fsl_imx9_ddr_perf.c +++ b/drivers/perf/fsl_imx9_ddr_perf.c @@ -853,7 +853,7 @@ static struct platform_driver imx_ddr_pmu_driver = { .suppress_bind_attrs = true, }, .probe = ddr_perf_probe, - .remove_new = ddr_perf_remove, + .remove = ddr_perf_remove, }; module_platform_driver(imx_ddr_pmu_driver); diff --git a/drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c b/drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c index 0e923f94fa5b..3f3fb1de11f5 100644 --- a/drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c @@ -358,7 +358,7 @@ static struct platform_driver hisi_cpa_pmu_driver = { .suppress_bind_attrs = true, }, .probe = hisi_cpa_pmu_probe, - .remove_new = hisi_cpa_pmu_remove, + .remove = hisi_cpa_pmu_remove, }; static int __init hisi_cpa_pmu_module_init(void) diff --git a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c index b804e3738113..a6ebf2ec99d3 100644 --- a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c +++ 
b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c @@ -547,7 +547,7 @@ static struct platform_driver hisi_ddrc_pmu_driver = { .suppress_bind_attrs = true, }, .probe = hisi_ddrc_pmu_probe, - .remove_new = hisi_ddrc_pmu_remove, + .remove = hisi_ddrc_pmu_remove, }; static int __init hisi_ddrc_pmu_module_init(void) diff --git a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c index 21e69b1cdd4d..32624872596f 100644 --- a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c @@ -550,7 +550,7 @@ static struct platform_driver hisi_hha_pmu_driver = { .suppress_bind_attrs = true, }, .probe = hisi_hha_pmu_probe, - .remove_new = hisi_hha_pmu_remove, + .remove = hisi_hha_pmu_remove, }; static int __init hisi_hha_pmu_module_init(void) diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c index 51ba76871097..c235b46ce873 100644 --- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c @@ -584,7 +584,7 @@ static struct platform_driver hisi_l3c_pmu_driver = { .suppress_bind_attrs = true, }, .probe = hisi_l3c_pmu_probe, - .remove_new = hisi_l3c_pmu_remove, + .remove = hisi_l3c_pmu_remove, }; static int __init hisi_l3c_pmu_module_init(void) diff --git a/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c index 3cdb35c741f9..c0f5d7c73e06 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c @@ -538,7 +538,7 @@ static struct platform_driver hisi_pa_pmu_driver = { .suppress_bind_attrs = true, }, .probe = hisi_pa_pmu_probe, - .remove_new = hisi_pa_pmu_remove, + .remove = hisi_pa_pmu_remove, }; static int __init hisi_pa_pmu_module_init(void) diff --git a/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c index 765bbd61db26..c5f4764ee888 100644 --- a/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c @@ -476,7 +476,7 @@ static struct platform_driver hisi_sllc_pmu_driver = { .suppress_bind_attrs = true, }, .probe = hisi_sllc_pmu_probe, - .remove_new = hisi_sllc_pmu_remove, + .remove = hisi_sllc_pmu_remove, }; static int __init hisi_sllc_pmu_module_init(void) diff --git a/drivers/perf/marvell_cn10k_ddr_pmu.c b/drivers/perf/marvell_cn10k_ddr_pmu.c index 94f1ebcd2a27..8860d9f687ae 100644 --- a/drivers/perf/marvell_cn10k_ddr_pmu.c +++ b/drivers/perf/marvell_cn10k_ddr_pmu.c @@ -732,7 +732,7 @@ static struct platform_driver cn10k_ddr_pmu_driver = { .suppress_bind_attrs = true, }, .probe = cn10k_ddr_perf_probe, - .remove_new = cn10k_ddr_perf_remove, + .remove = cn10k_ddr_perf_remove, }; static int __init cn10k_ddr_pmu_init(void) diff --git a/drivers/perf/marvell_cn10k_tad_pmu.c b/drivers/perf/marvell_cn10k_tad_pmu.c index 9e635f355470..cda55ee35eee 100644 --- a/drivers/perf/marvell_cn10k_tad_pmu.c +++ b/drivers/perf/marvell_cn10k_tad_pmu.c @@ -383,7 +383,7 @@ static struct platform_driver tad_pmu_driver = { .suppress_bind_attrs = true, }, .probe = tad_pmu_probe, - .remove_new = tad_pmu_remove, + .remove = tad_pmu_remove, }; static int tad_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) diff --git a/drivers/perf/qcom_l2_pmu.c b/drivers/perf/qcom_l2_pmu.c index 980e3051edd7..ea8c85729937 100644 --- a/drivers/perf/qcom_l2_pmu.c +++ b/drivers/perf/qcom_l2_pmu.c @@ -981,7 +981,7 @@ static struct platform_driver l2_cache_pmu_driver = { .suppress_bind_attrs = true, 
}, .probe = l2_cache_pmu_probe, - .remove_new = l2_cache_pmu_remove, + .remove = l2_cache_pmu_remove, }; static int __init register_l2_cache_pmu_driver(void) diff --git a/drivers/perf/thunderx2_pmu.c b/drivers/perf/thunderx2_pmu.c index faf763d2c95c..cadd60221b8f 100644 --- a/drivers/perf/thunderx2_pmu.c +++ b/drivers/perf/thunderx2_pmu.c @@ -1010,7 +1010,7 @@ static struct platform_driver tx2_uncore_driver = { .suppress_bind_attrs = true, }, .probe = tx2_uncore_probe, - .remove_new = tx2_uncore_remove, + .remove = tx2_uncore_remove, }; static int __init tx2_uncore_driver_init(void) diff --git a/drivers/perf/xgene_pmu.c b/drivers/perf/xgene_pmu.c index c01466ae1e3d..33b5497bdc06 100644 --- a/drivers/perf/xgene_pmu.c +++ b/drivers/perf/xgene_pmu.c @@ -1943,7 +1943,7 @@ static void xgene_pmu_remove(struct platform_device *pdev) static struct platform_driver xgene_pmu_driver = { .probe = xgene_pmu_probe, - .remove_new = xgene_pmu_remove, + .remove = xgene_pmu_remove, .driver = { .name = "xgene-pmu", .of_match_table = xgene_pmu_of_match, From 340fd66c856651d8c1d29f392dd26ad674d2db0e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 7 Nov 2024 01:18:42 +0900 Subject: [PATCH 137/168] arm64: fix .data.rel.ro size assertion when CONFIG_LTO_CLANG Commit be2881824ae9 ("arm64/build: Assert for unwanted sections") introduced an assertion to ensure that the .data.rel.ro section does not exist. However, this check does not work when CONFIG_LTO_CLANG is enabled, because .data.rel.ro matches the .data.[0-9a-zA-Z_]* pattern in the DATA_MAIN macro. Move the ASSERT() above the RW_DATA() line. Fixes: be2881824ae9 ("arm64/build: Assert for unwanted sections") Signed-off-by: Masahiro Yamada Acked-by: Will Deacon Link: https://lore.kernel.org/r/20241106161843.189927-1-masahiroy@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/vmlinux.lds.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 58d89d997d05..f84c71f04d9e 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -287,6 +287,9 @@ SECTIONS __initdata_end = .; __init_end = .; + .data.rel.ro : { *(.data.rel.ro) } + ASSERT(SIZEOF(.data.rel.ro) == 0, "Unexpected RELRO detected!") + _data = .; _sdata = .; RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) @@ -343,9 +346,6 @@ SECTIONS *(.plt) *(.plt.*) *(.iplt) *(.igot .igot.plt) } ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") - - .data.rel.ro : { *(.data.rel.ro) } - ASSERT(SIZEOF(.data.rel.ro) == 0, "Unexpected RELRO detected!") } #include "image-vars.h" From a3590d71a1acef850864f19bff2f37f56b2d4f00 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 30 Oct 2024 00:02:02 +0000 Subject: [PATCH 138/168] kselftest/arm64: Increase frequency of signal delivery in fp-stress Currently we only deliver signals to the processes being tested about once a second, meaning that the signal code paths are subject to relatively little stress. Increase this frequency substantially to 25ms intervals, along with some minor refactoring to make this more readily tuneable and maintain the 1s logging interval. 
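As a quick editorial cross-check of those numbers (derived from the constants added in the diff below, not text from the commit itself), the logging cadence and the default run length are unchanged even though signals are now delivered every 25ms:

	#define SIGNAL_INTERVAL_MS 25
	#define LOG_INTERVALS (1000 / SIGNAL_INTERVAL_MS)	/* 40 poll intervals, so one log line per second */

	int timeout = 10 * (1000 / SIGNAL_INTERVAL_MS);		/* 400 poll intervals, i.e. the previous 10s default */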
This interval was chosen based on some experimentation with emulated platforms to avoid causing so much extra load that the test starts to run into the 45s limit for selftests or generally completely disconnect the timeout numbers from the elapsed wall clock time. We could increase this if we moved the signal generation out of the main supervisor thread, though we should also consider that the percentage of time that we spend interacting with the floating point state is also a consideration. Suggested-by: Mark Rutland Signed-off-by: Mark Brown Acked-by: Will Deacon Link: https://lore.kernel.org/r/20241030-arm64-fp-stress-interval-v2-1-bd3cef48c22c@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/fp-stress.c | 26 +++++++++++--------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c index 13958e645afc..b81bc0842f17 100644 --- a/tools/testing/selftests/arm64/fp/fp-stress.c +++ b/tools/testing/selftests/arm64/fp/fp-stress.c @@ -28,6 +28,9 @@ #define MAX_VLS 16 +#define SIGNAL_INTERVAL_MS 25 +#define LOG_INTERVALS (1000 / SIGNAL_INTERVAL_MS) + struct child_data { char *name, *output; pid_t pid; @@ -448,7 +451,7 @@ static const struct option options[] = { int main(int argc, char **argv) { int ret; - int timeout = 10; + int timeout = 10 * (1000 / SIGNAL_INTERVAL_MS); int cpus, i, j, c; int sve_vl_count, sme_vl_count; bool all_children_started = false; @@ -504,7 +507,7 @@ int main(int argc, char **argv) have_sme2 ? "present" : "absent"); if (timeout > 0) - ksft_print_msg("Will run for %ds\n", timeout); + ksft_print_msg("Will run for %d\n", timeout); else ksft_print_msg("Will run until terminated\n"); @@ -577,14 +580,14 @@ int main(int argc, char **argv) break; /* - * Timeout is counted in seconds with no output, the - * tests print during startup then are silent when - * running so this should ensure they all ran enough - * to install the signal handler, this is especially - * useful in emulation where we will both be slow and - * likely to have a large set of VLs. + * Timeout is counted in poll intervals with no + * output, the tests print during startup then are + * silent when running so this should ensure they all + * ran enough to install the signal handler, this is + * especially useful in emulation where we will both + * be slow and likely to have a large set of VLs. */ - ret = epoll_wait(epoll_fd, evs, tests, 1000); + ret = epoll_wait(epoll_fd, evs, tests, SIGNAL_INTERVAL_MS); if (ret < 0) { if (errno == EINTR) continue; @@ -624,8 +627,9 @@ int main(int argc, char **argv) all_children_started = true; } - ksft_print_msg("Sending signals, timeout remaining: %d\n", - timeout); + if ((timeout % LOG_INTERVALS) == 0) + ksft_print_msg("Sending signals, timeout remaining: %d\n", + timeout); for (i = 0; i < num_children; i++) child_tickle(&children[i]); From 161e9925053cafa83a1eb265a001b6917dfafa29 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 30 Oct 2024 00:02:03 +0000 Subject: [PATCH 139/168] kselftest/arm64: Poll less often while waiting for fp-stress children While fp-stress is waiting for children to start it doesn't send any signals to them so there is no need for it to have as short an epoll() timeout as it does when the children are all running. We do still want to have some timeout so that we can log diagnostics about missing children but this can be relatively large.
On emulated platforms the overhead of running the supervisor process is quite high, especially during the process of execing the test binaries. Implement a longer epoll() timeout during the setup phase, using a 5s timeout while waiting for children and switching to the signal raise interval when all the children are started and we start sending signals. Signed-off-by: Mark Brown Acked-by: Will Deacon Link: https://lore.kernel.org/r/20241030-arm64-fp-stress-interval-v2-2-bd3cef48c22c@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/fp-stress.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c index b81bc0842f17..ad867ff9687a 100644 --- a/tools/testing/selftests/arm64/fp/fp-stress.c +++ b/tools/testing/selftests/arm64/fp/fp-stress.c @@ -452,6 +452,7 @@ int main(int argc, char **argv) { int ret; int timeout = 10 * (1000 / SIGNAL_INTERVAL_MS); + int poll_interval = 5000; int cpus, i, j, c; int sve_vl_count, sme_vl_count; bool all_children_started = false; @@ -587,7 +588,7 @@ int main(int argc, char **argv) * especially useful in emulation where we will both * be slow and likely to have a large set of VLs. */ - ret = epoll_wait(epoll_fd, evs, tests, SIGNAL_INTERVAL_MS); + ret = epoll_wait(epoll_fd, evs, tests, poll_interval); if (ret < 0) { if (errno == EINTR) continue; @@ -625,6 +626,7 @@ int main(int argc, char **argv) } all_children_started = true; + poll_interval = SIGNAL_INTERVAL_MS; } if ((timeout % LOG_INTERVALS) == 0) From 94de486e4215b83de8a5e908b4b8f7d3979c3b0e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 7 Nov 2024 01:39:20 +0000 Subject: [PATCH 140/168] kselftest/arm64: Correct misleading comments on fp-stress irritators The comments in the handlers for the irritator signal in the test threads for fp-stress suggest that the irritator will corrupt the register state observed by the main thread but this is not the case, instead the FPSIMD and SVE irritators (which are the only ones that are implemented) modify the current register state which is expected to be overwritten on return from the handler by the saved register state. Update the comment to reflect what the handler is actually doing. Acked-by: Mark Rutland Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241107-arm64-fp-stress-irritator-v2-1-c4b9622e36ee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/fpsimd-test.S | 3 +-- tools/testing/selftests/arm64/fp/sve-test.S | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/fpsimd-test.S b/tools/testing/selftests/arm64/fp/fpsimd-test.S index 8b960d01ed2e..bdfb7cf2e4ec 100644 --- a/tools/testing/selftests/arm64/fp/fpsimd-test.S +++ b/tools/testing/selftests/arm64/fp/fpsimd-test.S @@ -134,8 +134,7 @@ function check_vreg b memcmp endfunction -// Any SVE register modified here can cause corruption in the main -// thread -- but *only* the registers modified here. 
+// Modify live register state, the signal return will undo our changes function irritator_handler // Increment the irritation signal count (x23): ldr x0, [x2, #ucontext_regs + 8 * 23] diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S index fff60e2a25ad..e3c0d585684d 100644 --- a/tools/testing/selftests/arm64/fp/sve-test.S +++ b/tools/testing/selftests/arm64/fp/sve-test.S @@ -291,8 +291,7 @@ function check_ffr #endif endfunction -// Any SVE register modified here can cause corruption in the main -// thread -- but *only* the registers modified here. +// Modify live register state, the signal return will undo our changes function irritator_handler // Increment the irritation signal count (x23): ldr x0, [x2, #ucontext_regs + 8 * 23] From ffca567fef9c4661c792ca48a2cdd038b2df4887 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 7 Nov 2024 01:39:21 +0000 Subject: [PATCH 141/168] kselftest/arm64: Remove unused ADRs from irritator handlers The irritator handlers for the fp-stress test programs all use ADR to load an address into x0 which is then not referenced. Remove these ADRs as they just cause confusion. Acked-by: Mark Rutland Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241107-arm64-fp-stress-irritator-v2-2-c4b9622e36ee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/fpsimd-test.S | 1 - tools/testing/selftests/arm64/fp/sve-test.S | 1 - tools/testing/selftests/arm64/fp/za-test.S | 1 - tools/testing/selftests/arm64/fp/zt-test.S | 1 - 4 files changed, 4 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/fpsimd-test.S b/tools/testing/selftests/arm64/fp/fpsimd-test.S index bdfb7cf2e4ec..9977ffdd758a 100644 --- a/tools/testing/selftests/arm64/fp/fpsimd-test.S +++ b/tools/testing/selftests/arm64/fp/fpsimd-test.S @@ -142,7 +142,6 @@ function irritator_handler str x0, [x2, #ucontext_regs + 8 * 23] // Corrupt some random V-regs - adr x0, .text + (irritator_handler - .text) / 16 * 16 movi v0.8b, #7 movi v9.16b, #9 movi v31.8b, #31 diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S index e3c0d585684d..f1fb9745c681 100644 --- a/tools/testing/selftests/arm64/fp/sve-test.S +++ b/tools/testing/selftests/arm64/fp/sve-test.S @@ -299,7 +299,6 @@ function irritator_handler str x0, [x2, #ucontext_regs + 8 * 23] // Corrupt some random Z-regs - adr x0, .text + (irritator_handler - .text) / 16 * 16 movi v0.8b, #1 movi v9.16b, #2 movi v31.8b, #3 diff --git a/tools/testing/selftests/arm64/fp/za-test.S b/tools/testing/selftests/arm64/fp/za-test.S index 095b45531640..1ee0ec36766d 100644 --- a/tools/testing/selftests/arm64/fp/za-test.S +++ b/tools/testing/selftests/arm64/fp/za-test.S @@ -158,7 +158,6 @@ function irritator_handler // Corrupt some random ZA data #if 0 - adr x0, .text + (irritator_handler - .text) / 16 * 16 movi v0.8b, #1 movi v9.16b, #2 movi v31.8b, #3 diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S index b5c81e81a379..ade9c98abcda 100644 --- a/tools/testing/selftests/arm64/fp/zt-test.S +++ b/tools/testing/selftests/arm64/fp/zt-test.S @@ -127,7 +127,6 @@ function irritator_handler // Corrupt some random ZT data #if 0 - adr x0, .text + (irritator_handler - .text) / 16 * 16 movi v0.8b, #1 movi v9.16b, #2 movi v31.8b, #3 From d65f27d240bb897ac5a4a3fb7557724b3717e022 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 7 Nov 2024 01:39:23 +0000 Subject: [PATCH 142/168] kselftest/arm64: Implement 
irritators for ZA and ZT Currently we don't use the irritator signal in our floating point stress tests so when we added ZA and ZT stress tests we didn't actually bother implementing any actual action in the handlers, we just counted the signal deliveries. In preparation for using the irritators let's implement them, just trivially SMSTOP and SMSTART to reset all bits in the register to 0. Acked-by: Mark Rutland Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241107-arm64-fp-stress-irritator-v2-4-c4b9622e36ee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/za-test.S | 12 ++++-------- tools/testing/selftests/arm64/fp/zt-test.S | 12 ++++-------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/za-test.S b/tools/testing/selftests/arm64/fp/za-test.S index 1ee0ec36766d..f902e6ef9077 100644 --- a/tools/testing/selftests/arm64/fp/za-test.S +++ b/tools/testing/selftests/arm64/fp/za-test.S @@ -148,20 +148,16 @@ function check_za b memcmp endfunction -// Any SME register modified here can cause corruption in the main -// thread -- but *only* the locations modified here. +// Modify the live SME register state, signal return will undo our changes function irritator_handler // Increment the irritation signal count (x23): ldr x0, [x2, #ucontext_regs + 8 * 23] add x0, x0, #1 str x0, [x2, #ucontext_regs + 8 * 23] - // Corrupt some random ZA data -#if 0 - movi v0.8b, #1 - movi v9.16b, #2 - movi v31.8b, #3 -#endif + // This will reset ZA to all bits 0 + smstop + smstart ret endfunction diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S index ade9c98abcda..c96cb7c2ad4b 100644 --- a/tools/testing/selftests/arm64/fp/zt-test.S +++ b/tools/testing/selftests/arm64/fp/zt-test.S @@ -117,20 +117,16 @@ function check_zt b memcmp endfunction -// Any SME register modified here can cause corruption in the main -// thread -- but *only* the locations modified here. +// Modify the live SME register state, signal return will undo our changes function irritator_handler // Increment the irritation signal count (x23): ldr x0, [x2, #ucontext_regs + 8 * 23] add x0, x0, #1 str x0, [x2, #ucontext_regs + 8 * 23] - // Corrupt some random ZT data -#if 0 - movi v0.8b, #1 - movi v9.16b, #2 - movi v31.8b, #3 -#endif + // This will reset ZT to all bits 0 + smstop + smstart ret endfunction From 7368debf275aa2419365ce47da00922ca51c6094 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 7 Nov 2024 01:39:24 +0000 Subject: [PATCH 143/168] kselftest/arm64: Provide a SIGUSR1 handler in the kernel mode FP stress test The other stress test programs provide a SIGUSR1 handler which modifies the live register state in order to validate that signal context is being restored during signal return. While we can't usefully do this when testing kernel mode FP usage provide a handler for SIGUSR1 which just counts the number of signals like we do for SIGUSR2, allowing fp-stress to treat all the test programs uniformly. 
Acked-by: Mark Rutland Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241107-arm64-fp-stress-irritator-v2-5-c4b9622e36ee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/kernel-test.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/arm64/fp/kernel-test.c b/tools/testing/selftests/arm64/fp/kernel-test.c index e8da3b4cbd23..859345379044 100644 --- a/tools/testing/selftests/arm64/fp/kernel-test.c +++ b/tools/testing/selftests/arm64/fp/kernel-test.c @@ -267,6 +267,10 @@ int main(void) strerror(errno), errno); sa.sa_sigaction = handle_kick_signal; + ret = sigaction(SIGUSR1, &sa, NULL); + if (ret < 0) + printf("Failed to install SIGUSR1 handler: %s (%d)\n", + strerror(errno), errno); ret = sigaction(SIGUSR2, &sa, NULL); if (ret < 0) printf("Failed to install SIGUSR2 handler: %s (%d)\n", From ead1c35ce3b3c766443a82b56ee343cfe7ee8305 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 7 Nov 2024 01:39:25 +0000 Subject: [PATCH 144/168] kselftest/arm64: Test signal handler state modification in fp-stress Currently in fp-stress we test signal delivery to the test threads by sending SIGUSR2 which simply counts how many signals are delivered. The test programs now also all have a SIGUSR1 handler which for the threads doing userspace testing additionally modifies the floating point register state in the signal handler, verifying that when we return the saved register state is restored from the signal context as expected. Switch over to triggering that to validate that we are restoring as expected. Acked-by: Mark Rutland Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241107-arm64-fp-stress-irritator-v2-6-c4b9622e36ee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/fp-stress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c index ad867ff9687a..74e23208b94c 100644 --- a/tools/testing/selftests/arm64/fp/fp-stress.c +++ b/tools/testing/selftests/arm64/fp/fp-stress.c @@ -223,7 +223,7 @@ static void child_output(struct child_data *child, uint32_t events, static void child_tickle(struct child_data *child) { if (child->output_seen && !child->exited) - kill(child->pid, SIGUSR2); + kill(child->pid, SIGUSR1); } static void child_stop(struct child_data *child) From db64dfffcad2992d6bfc680822bdf715335c43f1 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 7 Nov 2024 13:16:40 +0000 Subject: [PATCH 145/168] selftests/mm: Define PKEY_UNRESTRICTED for pkey_sighandler_tests Commit 6e182dc9f268 ("selftests/mm: Use generic pkey register manipulation") makes use of PKEY_UNRESTRICTED in pkey_sighandler_tests. The macro has been proposed for addition to uapi headers [1], but the patch hasn't landed yet. Define PKEY_UNRESTRICTED in pkey-helpers.h for the time being to fix the build. 
[1] https://lore.kernel.org/all/20241028090715.509527-2-yury.khrustalev@arm.com/ Fixes: 6e182dc9f268 ("selftests/mm: Use generic pkey register manipulation") Reported-by: Aishwarya TCV Signed-off-by: Kevin Brodsky Link: https://lore.kernel.org/r/20241107131640.650703-1-kevin.brodsky@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/mm/pkey-helpers.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h index 9ab6a3ee153b..f7cfe163b0ff 100644 --- a/tools/testing/selftests/mm/pkey-helpers.h +++ b/tools/testing/selftests/mm/pkey-helpers.h @@ -112,6 +112,13 @@ void record_pkey_malloc(void *ptr, long size, int prot); #define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) #endif +/* + * FIXME: Remove once the generic PKEY_UNRESTRICTED definition is merged. + */ +#ifndef PKEY_UNRESTRICTED +#define PKEY_UNRESTRICTED 0x0 +#endif + #ifndef set_pkey_bits static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) { From 1a9de2f6fda69d5f105dd8af776856a66abdaa64 Mon Sep 17 00:00:00 2001 From: Aleksandr Mishin Date: Tue, 27 Aug 2024 13:12:39 +0300 Subject: [PATCH 146/168] acpi/arm64: Adjust error handling procedure in gtdt_parse_timer_block() In case of an error in gtdt_parse_timer_block(), an invalid 'gtdt_frame' will be used in the 'do {} while (i-- >= 0 && gtdt_frame--);' statement block, because the do{} block will be executed even if 'i == 0'. Adjust the error handling procedure by replacing 'i-- >= 0' with 'i-- > 0'. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: a712c3ed9b8a ("acpi/arm64: Add memory-mapped timer support in GTDT driver") Signed-off-by: Aleksandr Mishin Acked-by: Hanjun Guo Acked-by: Sudeep Holla Acked-by: Aleksandr Mishin Link: https://lore.kernel.org/r/20240827101239.22020-1-amishin@t-argos.ru Signed-off-by: Catalin Marinas --- drivers/acpi/arm64/gtdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c index d7c4e1b9915b..81dc0582743d 100644 --- a/drivers/acpi/arm64/gtdt.c +++ b/drivers/acpi/arm64/gtdt.c @@ -294,7 +294,7 @@ error: if (frame->virt_irq > 0) acpi_unregister_gsi(gtdt_frame->virtual_timer_interrupt); frame->virt_irq = 0; - } while (i-- >= 0 && gtdt_frame--); + } while (i-- > 0 && gtdt_frame--); return -EINVAL; } From bdf94836c22abc923af5ae83709c96ff4c3c77c9 Mon Sep 17 00:00:00 2001 From: Liao Chang Date: Thu, 19 Sep 2024 12:17:19 +0000 Subject: [PATCH 147/168] arm64: uprobes: Optimize cache flushes for xol slot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Profiling of the single-thread selftests bench reveals a bottleneck in caches_clean_inval_pou() on ARM64. On my local testing machine, this function takes approximately 34% of CPU cycles for trig-uprobe-nop and trig-uprobe-push. This patch adds a check to avoid an unnecessary cache flush when writing an instruction to the xol slot. If the instruction is the same as the existing instruction in the slot, there is no need to synchronize the D/I caches. Since xol slot allocation and updates occur on the hot path of uprobe handling, this saves measurable work: the upstream kernel running on Kunpeng916 (Hi1616), 4 NUMA nodes, 64 cores @ 2.4GHz shows an obvious gain from this optimization for the nop and push testcases.
Before (next-20240918) ---------------------- uprobe-nop ( 1 cpus): 0.418 ± 0.001M/s ( 0.418M/s/cpu) uprobe-push ( 1 cpus): 0.411 ± 0.005M/s ( 0.411M/s/cpu) uprobe-ret ( 1 cpus): 2.052 ± 0.002M/s ( 2.052M/s/cpu) uretprobe-nop ( 1 cpus): 0.350 ± 0.000M/s ( 0.350M/s/cpu) uretprobe-push ( 1 cpus): 0.353 ± 0.000M/s ( 0.353M/s/cpu) uretprobe-ret ( 1 cpus): 1.074 ± 0.001M/s ( 1.074M/s/cpu) After ----- uprobe-nop ( 1 cpus): 0.926 ± 0.000M/s ( 0.926M/s/cpu) uprobe-push ( 1 cpus): 0.910 ± 0.001M/s ( 0.910M/s/cpu) uprobe-ret ( 1 cpus): 2.056 ± 0.001M/s ( 2.056M/s/cpu) uretprobe-nop ( 1 cpus): 0.653 ± 0.001M/s ( 0.653M/s/cpu) uretprobe-push ( 1 cpus): 0.645 ± 0.000M/s ( 0.645M/s/cpu) uretprobe-ret ( 1 cpus): 1.093 ± 0.001M/s ( 1.093M/s/cpu) Signed-off-by: Liao Chang Link: https://lore.kernel.org/r/20240919121719.2148361-1-liaochang1@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/probes/uprobes.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index d49aef2657cd..ba61651db4b5 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -17,12 +17,20 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *xol_page_kaddr = kmap_atomic(page); void *dst = xol_page_kaddr + (vaddr & ~PAGE_MASK); + /* + * Initial cache maintenance of the xol page done via set_pte_at(). + * Subsequent CMOs only needed if the xol slot changes. + */ + if (!memcmp(dst, src, len)) + goto done; + /* Initialize the slot */ memcpy(dst, src, len); /* flush caches (dcache/icache) */ sync_icache_aliases((unsigned long)dst, (unsigned long)dst + len); +done: kunmap_atomic(xol_page_kaddr); } From ccf54058f5328e6da5ecc61c961bf68bbb7d865b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 6 Nov 2024 19:55:15 +0100 Subject: [PATCH 148/168] arm64/scs: Fix handling of DWARF augmentation data in CIE/FDE frames The dynamic SCS patching code pretends to parse the DWARF augmentation data in the CIE (header) frame, and handle accordingly when processing the individual FDE frames based on this CIE frame. However, the boolean variable is defined inside the loop, and so the parsed value is ignored. The same applies to the code alignment field, which is also read from the header but then discarded. This was never spotted before because Clang is the only compiler that supports dynamic SCS patching (which is essentially an Android feature), and the unwind tables it produces are highly uniform, and match the de facto defaults. So instead of testing for the 'z' flag in the augmentation data field, require a fixed augmentation data string of 'zR', and simplify the rest of the code accordingly. Also introduce some error codes to specify why the patching failed, and log it to the kernel console on failure when this happens when loading a module. (Doing so for vmlinux is infeasible, as the patching is done extremely early in the boot.) 
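For reference, the single-byte assumption in the parser works because ULEB128 stores seven payload bits per byte and sets bit 7 on every byte except the last, so any value below 128 occupies exactly one byte with bit 7 clear; that is the only case the patched code accepts. A minimal sketch of a general decoder, for comparison only (the helper name is illustrative and not part of the kernel sources):

static unsigned long uleb128_decode(const unsigned char **pp)
{
	const unsigned char *p = *pp;
	unsigned long val = 0;
	int shift = 0;

	do {
		/* low seven bits are payload, bit 7 flags a following byte */
		val |= (unsigned long)(*p & 0x7f) << shift;
		shift += 7;
	} while (*p++ & 0x80);

	*pp = p;	/* advance past the encoded value */
	return val;
}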
Signed-off-by: Ard Biesheuvel Reviewed-by: Sami Tolvanen Tested-by: Sami Tolvanen Link: https://lore.kernel.org/r/20241106185513.3096442-6-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/scs.h | 7 ++++ arch/arm64/kernel/module.c | 10 ++++- arch/arm64/kernel/pi/patch-scs.c | 63 ++++++++++++++++++-------------- 3 files changed, 50 insertions(+), 30 deletions(-) diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h index 2e010ea76be2..934e9217cd74 100644 --- a/arch/arm64/include/asm/scs.h +++ b/arch/arm64/include/asm/scs.h @@ -46,6 +46,13 @@ static inline void dynamic_scs_init(void) static inline void dynamic_scs_init(void) {} #endif +enum { + EDYNSCS_INVALID_CIE_HEADER = 1, + EDYNSCS_INVALID_CIE_SDATA_SIZE = 2, + EDYNSCS_INVALID_FDE_AUGM_DATA_SIZE = 3, + EDYNSCS_INVALID_CFA_OPCODE = 4, +}; + int __pi_scs_patch(const u8 eh_frame[], int size); asmlinkage void __pi_scs_patch_vmlinux(void); diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 36b25af56324..06bb680bfe97 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -462,14 +462,20 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { const Elf_Shdr *s; + int ret; + s = find_section(hdr, sechdrs, ".altinstructions"); if (s) apply_alternatives_module((void *)s->sh_addr, s->sh_size); if (scs_is_dynamic()) { s = find_section(hdr, sechdrs, ".init.eh_frame"); - if (s) - __pi_scs_patch((void *)s->sh_addr, s->sh_size); + if (s) { + ret = __pi_scs_patch((void *)s->sh_addr, s->sh_size); + if (ret) + pr_err("module %s: error occurred during dynamic SCS patching (%d)\n", + me->name, ret); + } } return module_init_ftrace_plt(hdr, sechdrs, me); diff --git a/arch/arm64/kernel/pi/patch-scs.c b/arch/arm64/kernel/pi/patch-scs.c index 49d8b40e61bc..cec8f0a52bbc 100644 --- a/arch/arm64/kernel/pi/patch-scs.c +++ b/arch/arm64/kernel/pi/patch-scs.c @@ -120,7 +120,11 @@ struct eh_frame { union { struct { // CIE u8 version; - u8 augmentation_string[]; + u8 augmentation_string[3]; + u8 code_alignment_factor; + u8 data_alignment_factor; + u8 return_address_register; + u8 augmentation_data_size; }; struct { // FDE @@ -132,25 +136,21 @@ struct eh_frame { }; static int scs_handle_fde_frame(const struct eh_frame *frame, - bool fde_has_augmentation_data, int code_alignment_factor, bool dry_run) { int size = frame->size - offsetof(struct eh_frame, opcodes) + 4; u64 loc = (u64)offset_to_ptr(&frame->initial_loc); const u8 *opcode = frame->opcodes; + int l; - if (fde_has_augmentation_data) { - int l; + // assume single byte uleb128_t for augmentation data size + if (*opcode & BIT(7)) + return EDYNSCS_INVALID_FDE_AUGM_DATA_SIZE; - // assume single byte uleb128_t - if (WARN_ON(*opcode & BIT(7))) - return -ENOEXEC; - - l = *opcode++; - opcode += l; - size -= l + 1; - } + l = *opcode++; + opcode += l; + size -= l + 1; /* * Starting from 'loc', apply the CFA opcodes that advance the location @@ -201,7 +201,7 @@ static int scs_handle_fde_frame(const struct eh_frame *frame, break; default: - return -ENOEXEC; + return EDYNSCS_INVALID_CFA_OPCODE; } } return 0; @@ -209,12 +209,11 @@ static int scs_handle_fde_frame(const struct eh_frame *frame, int scs_patch(const u8 eh_frame[], int size) { + int code_alignment_factor = 1; const u8 *p = eh_frame; while (size > 4) { const struct eh_frame *frame = (const void *)p; - bool fde_has_augmentation_data = true; - int code_alignment_factor = 1; int ret; if (frame->size == 0 || @@ -223,28 +222,36 @@ int scs_patch(const u8 eh_frame[], int size) 
break; if (frame->cie_id_or_pointer == 0) { - const u8 *p = frame->augmentation_string; - - /* a 'z' in the augmentation string must come first */ - fde_has_augmentation_data = *p == 'z'; + /* + * Require presence of augmentation data (z) with a + * specifier for the size of the FDE initial_loc and + * range fields (R), and nothing else. + */ + if (strcmp(frame->augmentation_string, "zR")) + return EDYNSCS_INVALID_CIE_HEADER; /* * The code alignment factor is a uleb128 encoded field * but given that the only sensible values are 1 or 4, - * there is no point in decoding the whole thing. + * there is no point in decoding the whole thing. Also + * sanity check the size of the data alignment factor + * field, and the values of the return address register + * and augmentation data size fields. */ - p += strlen(p) + 1; - if (!WARN_ON(*p & BIT(7))) - code_alignment_factor = *p; + if ((frame->code_alignment_factor & BIT(7)) || + (frame->data_alignment_factor & BIT(7)) || + frame->return_address_register != 30 || + frame->augmentation_data_size != 1) + return EDYNSCS_INVALID_CIE_HEADER; + + code_alignment_factor = frame->code_alignment_factor; } else { - ret = scs_handle_fde_frame(frame, - fde_has_augmentation_data, - code_alignment_factor, + ret = scs_handle_fde_frame(frame, code_alignment_factor, true); if (ret) return ret; - scs_handle_fde_frame(frame, fde_has_augmentation_data, - code_alignment_factor, false); + scs_handle_fde_frame(frame, code_alignment_factor, + false); } p += sizeof(frame->size) + frame->size; From 60de7a647fc51b8467f6fb1a8abfe4cab1c95687 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 6 Nov 2024 19:55:16 +0100 Subject: [PATCH 149/168] arm64/scs: Deal with 64-bit relative offsets in FDE frames In some cases, the compiler may decide to emit DWARF FDE frames with 64-bit signed fields for the code offset and range fields. This may happen when using the large code model, for instance, which permits an executable to be spread out over more than 4 GiB of address space. Whether this is the case can be inferred from the augmentation data in the CIE frame, so decode this data before processing the FDE frames. 
Signed-off-by: Ard Biesheuvel Reviewed-by: Sami Tolvanen Tested-by: Sami Tolvanen Link: https://lore.kernel.org/r/20241106185513.3096442-7-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/pi/patch-scs.c | 34 ++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/pi/patch-scs.c b/arch/arm64/kernel/pi/patch-scs.c index cec8f0a52bbc..55d0cd64ef71 100644 --- a/arch/arm64/kernel/pi/patch-scs.c +++ b/arch/arm64/kernel/pi/patch-scs.c @@ -50,6 +50,10 @@ bool dynamic_scs_is_enabled; #define DW_CFA_GNU_negative_offset_extended 0x2f #define DW_CFA_hi_user 0x3f +#define DW_EH_PE_sdata4 0x0b +#define DW_EH_PE_sdata8 0x0c +#define DW_EH_PE_pcrel 0x10 + enum { PACIASP = 0xd503233f, AUTIASP = 0xd50323bf, @@ -125,6 +129,7 @@ struct eh_frame { u8 data_alignment_factor; u8 return_address_register; u8 augmentation_data_size; + u8 fde_pointer_format; }; struct { // FDE @@ -132,11 +137,18 @@ struct eh_frame { s32 range; u8 opcodes[]; }; + + struct { // FDE + s64 initial_loc64; + s64 range64; + u8 opcodes64[]; + }; }; }; static int scs_handle_fde_frame(const struct eh_frame *frame, int code_alignment_factor, + bool use_sdata8, bool dry_run) { int size = frame->size - offsetof(struct eh_frame, opcodes) + 4; @@ -144,6 +156,12 @@ static int scs_handle_fde_frame(const struct eh_frame *frame, const u8 *opcode = frame->opcodes; int l; + if (use_sdata8) { + loc = (u64)&frame->initial_loc64 + frame->initial_loc64; + opcode = frame->opcodes64; + size -= 8; + } + // assume single byte uleb128_t for augmentation data size if (*opcode & BIT(7)) return EDYNSCS_INVALID_FDE_AUGM_DATA_SIZE; @@ -210,6 +228,7 @@ static int scs_handle_fde_frame(const struct eh_frame *frame, int scs_patch(const u8 eh_frame[], int size) { int code_alignment_factor = 1; + bool fde_use_sdata8 = false; const u8 *p = eh_frame; while (size > 4) { @@ -245,13 +264,24 @@ int scs_patch(const u8 eh_frame[], int size) return EDYNSCS_INVALID_CIE_HEADER; code_alignment_factor = frame->code_alignment_factor; + + switch (frame->fde_pointer_format) { + case DW_EH_PE_pcrel | DW_EH_PE_sdata4: + fde_use_sdata8 = false; + break; + case DW_EH_PE_pcrel | DW_EH_PE_sdata8: + fde_use_sdata8 = true; + break; + default: + return EDYNSCS_INVALID_CIE_SDATA_SIZE; + } } else { ret = scs_handle_fde_frame(frame, code_alignment_factor, - true); + fde_use_sdata8, true); if (ret) return ret; scs_handle_fde_frame(frame, code_alignment_factor, - false); + fde_use_sdata8, false); } p += sizeof(frame->size) + frame->size; From 47965a49a2c88ea850a30190b5eacd2d50f51a4b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 6 Nov 2024 19:55:17 +0100 Subject: [PATCH 150/168] arm64/scs: Drop unused prototype __pi_scs_patch_vmlinux() The function scs_patch_vmlinux() was removed in the LPA2 boot code refactoring so remove the declaration as well. 
Signed-off-by: Ard Biesheuvel Reviewed-by: Sami Tolvanen Tested-by: Sami Tolvanen Link: https://lore.kernel.org/r/20241106185513.3096442-8-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/scs.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h index 934e9217cd74..a76f9b387a26 100644 --- a/arch/arm64/include/asm/scs.h +++ b/arch/arm64/include/asm/scs.h @@ -54,7 +54,6 @@ enum { }; int __pi_scs_patch(const u8 eh_frame[], int size); -asmlinkage void __pi_scs_patch_vmlinux(void); #endif /* __ASSEMBLY __ */ From ae465d9ca192f582cf4932e628a25f9625a8bf83 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 8 Nov 2024 15:20:46 +0000 Subject: [PATCH 151/168] kselftest/arm64: Fix build with stricter assemblers While some assemblers (including the LLVM assembler I mostly use) will happily accept SMSTART as an instruction by default others, specifically gas, require that any architecture extensions be explicitly enabled. The assembler SME test programs use manually encoded helpers for the new instructions but no SMSTART helper is defined, only SM and ZA specific variants. Unfortunately the irritators that were just added use plain SMSTART so on stricter assemblers these fail to build: za-test.S:160: Error: selected processor does not support `smstart' Switch to using SMSTART ZA via the manually encoded smstart_za macro we already have defined. Fixes: d65f27d240bb ("kselftest/arm64: Implement irritators for ZA and ZT") Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241108-arm64-selftest-asm-error-v1-1-7ce27b42a677@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/za-test.S | 2 +- tools/testing/selftests/arm64/fp/zt-test.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/za-test.S b/tools/testing/selftests/arm64/fp/za-test.S index f902e6ef9077..5abcb8f009f8 100644 --- a/tools/testing/selftests/arm64/fp/za-test.S +++ b/tools/testing/selftests/arm64/fp/za-test.S @@ -157,7 +157,7 @@ function irritator_handler // This will reset ZA to all bits 0 smstop - smstart + smstart_za ret endfunction diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S index c96cb7c2ad4b..7b9de8d2a873 100644 --- a/tools/testing/selftests/arm64/fp/zt-test.S +++ b/tools/testing/selftests/arm64/fp/zt-test.S @@ -126,7 +126,7 @@ function irritator_handler // This will reset ZT to all bits 0 smstop - smstart + smstart_za ret endfunction From b6bd50dd3b564d50b8cd748de6bae58804ecb768 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 8 Nov 2024 13:49:17 +0000 Subject: [PATCH 152/168] kselftest/arm64: Fix printf() compiler warnings in the arm64 fp tests Lots of incorrect length modifiers, missing arguments or conversion specifiers. Fix them. 
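As background for this and the following printf() fixes: the warnings arise because types such as uint64_t and size_t map onto different basic C types across ABIs (uint64_t is 'unsigned long' on arm64/LP64 but 'unsigned long long' on 32-bit targets), so a hard-coded %llx or %lx can only ever match one of them. A small illustrative snippet, not part of the patch, showing the portable spelling via <inttypes.h>:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	uint64_t v = 0x1234abcdULL;
	size_t n = sizeof(v);

	/* PRIx64 expands to the correct length modifier for the libc in use */
	printf("value: %" PRIx64 ", size: %zu\n", v, n);
	return 0;
}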
Cc: Shuah Khan Cc: Mark Brown Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20241108134920.1233992-2-catalin.marinas@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/sve-ptrace.c | 16 +++++++++------- tools/testing/selftests/arm64/fp/za-ptrace.c | 8 +++++--- tools/testing/selftests/arm64/fp/zt-ptrace.c | 8 +++++--- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c index 6d61992fe8a0..577b6e05e860 100644 --- a/tools/testing/selftests/arm64/fp/sve-ptrace.c +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -82,10 +82,12 @@ static void fill_buf(char *buf, size_t size) static int do_child(void) { if (ptrace(PTRACE_TRACEME, -1, NULL, NULL)) - ksft_exit_fail_msg("PTRACE_TRACEME", strerror(errno)); + ksft_exit_fail_msg("ptrace(PTRACE_TRACEME) failed: %s (%d)\n", + strerror(errno), errno); if (raise(SIGSTOP)) - ksft_exit_fail_msg("raise(SIGSTOP)", strerror(errno)); + ksft_exit_fail_msg("raise(SIGSTOP) failed: %s (%d)\n", + strerror(errno), errno); return EXIT_SUCCESS; } @@ -340,7 +342,7 @@ static void ptrace_set_sve_get_sve_data(pid_t child, data_size = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE); write_buf = malloc(data_size); if (!write_buf) { - ksft_test_result_fail("Error allocating %d byte buffer for %s VL %u\n", + ksft_test_result_fail("Error allocating %ld byte buffer for %s VL %u\n", data_size, type->name, vl); return; } @@ -441,7 +443,7 @@ static void ptrace_set_sve_get_fpsimd_data(pid_t child, data_size = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE); write_buf = malloc(data_size); if (!write_buf) { - ksft_test_result_fail("Error allocating %d byte buffer for %s VL %u\n", + ksft_test_result_fail("Error allocating %ld byte buffer for %s VL %u\n", data_size, type->name, vl); return; } @@ -545,7 +547,7 @@ static void ptrace_set_fpsimd_get_sve_data(pid_t child, read_sve = read_buf; if (read_sve->vl != vl) { - ksft_test_result_fail("Child VL != expected VL %d\n", + ksft_test_result_fail("Child VL != expected VL: %u != %u\n", read_sve->vl, vl); goto out; } @@ -555,7 +557,7 @@ static void ptrace_set_fpsimd_get_sve_data(pid_t child, case SVE_PT_REGS_FPSIMD: expected_size = SVE_PT_FPSIMD_SIZE(vq, SVE_PT_REGS_FPSIMD); if (read_sve_size < expected_size) { - ksft_test_result_fail("Read %d bytes, expected %d\n", + ksft_test_result_fail("Read %ld bytes, expected %ld\n", read_sve_size, expected_size); goto out; } @@ -571,7 +573,7 @@ static void ptrace_set_fpsimd_get_sve_data(pid_t child, case SVE_PT_REGS_SVE: expected_size = SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE); if (read_sve_size < expected_size) { - ksft_test_result_fail("Read %d bytes, expected %d\n", + ksft_test_result_fail("Read %ld bytes, expected %ld\n", read_sve_size, expected_size); goto out; } diff --git a/tools/testing/selftests/arm64/fp/za-ptrace.c b/tools/testing/selftests/arm64/fp/za-ptrace.c index ac27d87396fc..08c777f87ea2 100644 --- a/tools/testing/selftests/arm64/fp/za-ptrace.c +++ b/tools/testing/selftests/arm64/fp/za-ptrace.c @@ -48,10 +48,12 @@ static void fill_buf(char *buf, size_t size) static int do_child(void) { if (ptrace(PTRACE_TRACEME, -1, NULL, NULL)) - ksft_exit_fail_msg("PTRACE_TRACEME", strerror(errno)); + ksft_exit_fail_msg("ptrace(PTRACE_TRACEME) failed: %s (%d)", + strerror(errno), errno); if (raise(SIGSTOP)) - ksft_exit_fail_msg("raise(SIGSTOP)", strerror(errno)); + ksft_exit_fail_msg("raise(SIGSTOP) failed: %s (%d)\n", + strerror(errno), errno); 
return EXIT_SUCCESS; } @@ -201,7 +203,7 @@ static void ptrace_set_get_data(pid_t child, unsigned int vl) data_size = ZA_PT_SIZE(vq); write_buf = malloc(data_size); if (!write_buf) { - ksft_test_result_fail("Error allocating %d byte buffer for VL %u\n", + ksft_test_result_fail("Error allocating %ld byte buffer for VL %u\n", data_size, vl); return; } diff --git a/tools/testing/selftests/arm64/fp/zt-ptrace.c b/tools/testing/selftests/arm64/fp/zt-ptrace.c index 996d9614a131..584b8d59b7ea 100644 --- a/tools/testing/selftests/arm64/fp/zt-ptrace.c +++ b/tools/testing/selftests/arm64/fp/zt-ptrace.c @@ -43,10 +43,12 @@ static void fill_buf(char *buf, size_t size) static int do_child(void) { if (ptrace(PTRACE_TRACEME, -1, NULL, NULL)) - ksft_exit_fail_msg("PTRACE_TRACEME", strerror(errno)); + ksft_exit_fail_msg("ptrace(PTRACE_TRACEME) failed: %s (%d)\n", + strerror(errno), errno); if (raise(SIGSTOP)) - ksft_exit_fail_msg("raise(SIGSTOP)", strerror(errno)); + ksft_exit_fail_msg("raise(SIGSTOP) failed: %s (%d)\n", + strerror(errno), errno); return EXIT_SUCCESS; } @@ -231,7 +233,7 @@ static void ptrace_enable_za_via_zt(pid_t child) /* Should have register data */ if (za_out->size < ZA_PT_SIZE(vq)) { ksft_print_msg("ZA data less than expected: %u < %u\n", - za_out->size, ZA_PT_SIZE(vq)); + za_out->size, (unsigned int)ZA_PT_SIZE(vq)); fail = true; vq = 0; } From 0cc6b94a445c53ab152554b4cf60575e1396adf6 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 8 Nov 2024 13:49:18 +0000 Subject: [PATCH 153/168] kselftest/arm64: Fix printf() warning in the arm64 MTE prctl() test While prctl() returns an 'int', the PR_MTE_TCF_MASK is defined as unsigned long which results in the larger type following a bitwise 'and' operation. Cast the printf() argument to 'int'. Cc: Shuah Khan Cc: Mark Brown Link: https://lore.kernel.org/r/20241108134920.1233992-3-catalin.marinas@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/check_prctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/mte/check_prctl.c b/tools/testing/selftests/arm64/mte/check_prctl.c index f139a33a43ef..4c89e9538ca0 100644 --- a/tools/testing/selftests/arm64/mte/check_prctl.c +++ b/tools/testing/selftests/arm64/mte/check_prctl.c @@ -85,7 +85,7 @@ void set_mode_test(const char *name, int hwcap2, int mask) ksft_test_result_pass("%s\n", name); } else { ksft_print_msg("Got %x, expected %x\n", - (ret & PR_MTE_TCF_MASK), mask); + (ret & (int)PR_MTE_TCF_MASK), mask); ksft_test_result_fail("%s\n", name); } } From 694e2803fece8d066bd85ce8607c630ce2b69859 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 8 Nov 2024 13:49:19 +0000 Subject: [PATCH 154/168] kselftest/arm64: Fix printf() compiler warnings in the arm64 syscall-abi.c tests Fix the incorrect length modifiers in arm64/abi/syscall-abi.c. 
Cc: Shuah Khan Cc: Mark Brown Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20241108134920.1233992-4-catalin.marinas@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/abi/syscall-abi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/arm64/abi/syscall-abi.c b/tools/testing/selftests/arm64/abi/syscall-abi.c index d704511a0955..5ec9a18ec802 100644 --- a/tools/testing/selftests/arm64/abi/syscall-abi.c +++ b/tools/testing/selftests/arm64/abi/syscall-abi.c @@ -81,7 +81,7 @@ static int check_gpr(struct syscall_cfg *cfg, int sve_vl, int sme_vl, uint64_t s */ for (i = 9; i < ARRAY_SIZE(gpr_in); i++) { if (gpr_in[i] != gpr_out[i]) { - ksft_print_msg("%s SVE VL %d mismatch in GPR %d: %llx != %llx\n", + ksft_print_msg("%s SVE VL %d mismatch in GPR %d: %lx != %lx\n", cfg->name, sve_vl, i, gpr_in[i], gpr_out[i]); errors++; @@ -112,7 +112,7 @@ static int check_fpr(struct syscall_cfg *cfg, int sve_vl, int sme_vl, if (!sve_vl && !(svcr & SVCR_SM_MASK)) { for (i = 0; i < ARRAY_SIZE(fpr_in); i++) { if (fpr_in[i] != fpr_out[i]) { - ksft_print_msg("%s Q%d/%d mismatch %llx != %llx\n", + ksft_print_msg("%s Q%d/%d mismatch %lx != %lx\n", cfg->name, i / 2, i % 2, fpr_in[i], fpr_out[i]); @@ -294,13 +294,13 @@ static int check_svcr(struct syscall_cfg *cfg, int sve_vl, int sme_vl, int errors = 0; if (svcr_out & SVCR_SM_MASK) { - ksft_print_msg("%s Still in SM, SVCR %llx\n", + ksft_print_msg("%s Still in SM, SVCR %lx\n", cfg->name, svcr_out); errors++; } if ((svcr_in & SVCR_ZA_MASK) != (svcr_out & SVCR_ZA_MASK)) { - ksft_print_msg("%s PSTATE.ZA changed, SVCR %llx != %llx\n", + ksft_print_msg("%s PSTATE.ZA changed, SVCR %lx != %lx\n", cfg->name, svcr_in, svcr_out); errors++; } From 929bbc16abfb0144db7ac619c77f60b188e555ab Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 8 Nov 2024 11:05:49 +0000 Subject: [PATCH 155/168] selftests/mm: Fix unused function warning for aarch64_write_signal_pkey() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 49f59573e9e0 ("selftests/mm: Enable pkey_sighandler_tests on arm64"), pkey_sighandler_tests.c (which includes pkey-arm64.h via pkey-helpers.h) ends up compiled for arm64. Since it doesn't use aarch64_write_signal_pkey(), the compiler warns: In file included from pkey-helpers.h:106, from pkey_sighandler_tests.c:31: pkey-arm64.h:130:13: warning: ‘aarch64_write_signal_pkey’ defined but not used [-Wunused-function] 130 | static void aarch64_write_signal_pkey(ucontext_t *uctxt, u64 pkey) | ^~~~~~~~~~~~~~~~~~~~~~~~~ Make the aarch64_write_signal_pkey() a 'static inline void' function to avoid the compiler warning. 
Fixes: f5b5ea51f78f ("selftests: mm: make protection_keys test work on arm64") Cc: Shuah Khan Cc: Joey Gouly Cc: Kevin Brodsky Reviewed-by: Kevin Brodsky Link: https://lore.kernel.org/r/20241108110549.1185923-1-catalin.marinas@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/mm/pkey-arm64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/pkey-arm64.h b/tools/testing/selftests/mm/pkey-arm64.h index d57fbeace38f..d9d2100eafc0 100644 --- a/tools/testing/selftests/mm/pkey-arm64.h +++ b/tools/testing/selftests/mm/pkey-arm64.h @@ -127,7 +127,7 @@ static inline u64 get_pkey_bits(u64 reg, int pkey) return 0; } -static void aarch64_write_signal_pkey(ucontext_t *uctxt, u64 pkey) +static inline void aarch64_write_signal_pkey(ucontext_t *uctxt, u64 pkey) { struct _aarch64_ctx *ctx = GET_UC_RESV_HEAD(uctxt); struct poe_context *poe_ctx = From 116e50d6474e82579086c0397d2fa3999815f29e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 6 Nov 2024 17:07:51 +0000 Subject: [PATCH 156/168] kselftest/arm64: Check that SVCR is 0 in signal handlers We don't currently validate that we exit streaming mode and clear ZA when we enter a signal handler. Add simple checks for this in the SSVE and ZA tests. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241106-arm64-fpmr-signal-test-v1-1-31fa34ce58fe@kernel.org [catalin.marinas@arm.com: Use %lx in fprintf() as uint64_t seems to be unsigned long in glibc] Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/signal/sve_helpers.h | 13 +++++++++++++ .../selftests/arm64/signal/testcases/ssve_regs.c | 5 +++++ .../selftests/arm64/signal/testcases/za_regs.c | 5 +++++ 3 files changed, 23 insertions(+) diff --git a/tools/testing/selftests/arm64/signal/sve_helpers.h b/tools/testing/selftests/arm64/signal/sve_helpers.h index 50948ce471cc..ca133b93375f 100644 --- a/tools/testing/selftests/arm64/signal/sve_helpers.h +++ b/tools/testing/selftests/arm64/signal/sve_helpers.h @@ -18,4 +18,17 @@ extern unsigned int nvls; int sve_fill_vls(bool use_sme, int min_vls); +static inline uint64_t get_svcr(void) +{ + uint64_t val; + + asm volatile ( + "mrs %0, S3_3_C4_C2_2\n" + : "=r"(val) + : + : "cc"); + + return val; +} + #endif diff --git a/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c b/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c index 6dbe48cf8b09..1dbca9afb13c 100644 --- a/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c @@ -85,6 +85,11 @@ static int do_one_sme_vl(struct tdescr *td, siginfo_t *si, ucontext_t *uc, fprintf(stderr, "Got expected size %u and VL %d\n", head->size, ssve->vl); + if (get_svcr() != 0) { + fprintf(stderr, "Unexpected SVCR %lx\n", get_svcr()); + return 1; + } + return 0; } diff --git a/tools/testing/selftests/arm64/signal/testcases/za_regs.c b/tools/testing/selftests/arm64/signal/testcases/za_regs.c index b9e13f27f1f9..badaead5326a 100644 --- a/tools/testing/selftests/arm64/signal/testcases/za_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/za_regs.c @@ -91,6 +91,11 @@ static int do_one_sme_vl(struct tdescr *td, siginfo_t *si, ucontext_t *uc, return 1; } + if (get_svcr() != 0) { + fprintf(stderr, "Unexpected SVCR %lx\n", get_svcr()); + return 1; + } + return 0; } From c297aa7d3fb6755890b78b483e82c9cf07370d50 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 11 Nov 2024 18:32:58 +0000 Subject: [PATCH 157/168] kselftest/arm64: Enable build of PAC tests with LLVM=1 
Currently we don't build the PAC selftests when building with LLVM=1 since we attempt to test for PAC support in the toolchain before we've set up the build system to point at LLVM in lib.mk, which has to be one of the last things in the Makefile. Since all versions of LLVM supported for use with the kernel have PAC support we can just sidestep the issue by just assuming PAC is there when doing a LLVM=1 build. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241111-arm64-selftest-pac-clang-v1-1-08599ceee418@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/pauth/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/arm64/pauth/Makefile b/tools/testing/selftests/arm64/pauth/Makefile index 72e290b0b10c..b5a1c80e0ead 100644 --- a/tools/testing/selftests/arm64/pauth/Makefile +++ b/tools/testing/selftests/arm64/pauth/Makefile @@ -7,8 +7,14 @@ CC := $(CROSS_COMPILE)gcc endif CFLAGS += -mbranch-protection=pac-ret + +# All supported LLVMs have PAC, test for GCC +ifeq ($(LLVM),1) +pauth_cc_support := 1 +else # check if the compiler supports ARMv8.3 and branch protection with PAuth pauth_cc_support := $(shell if ($(CC) $(CFLAGS) -march=armv8.3-a -E -x c /dev/null -o /dev/null 2>&1) then echo "1"; fi) +endif ifeq ($(pauth_cc_support),1) TEST_GEN_PROGS := pac From c0350076c13eac4f1d7f7ab6acd43bb252baef7a Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 12 Nov 2024 13:08:14 +0000 Subject: [PATCH 158/168] kselftets/arm64: Use flag bits for features in fp-ptrace assembler code The assembler portions of fp-ptrace are passed feature flags by the C code indicating which architectural features are supported. Currently these use an entire register for each flag which is wasteful and gets cumbersome as new flags are added. Switch to using flag bits in a single register to make things easier to maintain. No functional change. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241112-arm64-fp-ptrace-fpmr-v2-1-250b57c61254@kernel.org Signed-off-by: Catalin Marinas --- .../selftests/arm64/fp/fp-ptrace-asm.S | 32 +++++++++++-------- tools/testing/selftests/arm64/fp/fp-ptrace.c | 17 +++++++--- tools/testing/selftests/arm64/fp/fp-ptrace.h | 10 ++++++ 3 files changed, 41 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S b/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S index 7ad59d92d02b..5e7e9c878f2c 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S +++ b/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S @@ -15,10 +15,7 @@ // Load and save register values with pauses for ptrace // -// x0 - SVE in use -// x1 - SME in use -// x2 - SME2 in use -// x3 - FA64 supported +// x0 - HAVE_ flags indicating which features are in use .globl load_and_save load_and_save: @@ -44,7 +41,7 @@ load_and_save: ldp q30, q31, [x7, #16 * 30] // SME? - cbz x1, check_sve_in + tbz x0, #HAVE_SME_SHIFT, check_sve_in adrp x7, svcr_in ldr x7, [x7, :lo12:svcr_in] @@ -64,7 +61,7 @@ load_and_save: bne 1b // ZT? - cbz x2, check_sm_in + tbz x0, #HAVE_SME2_SHIFT, check_sm_in adrp x6, zt_in add x6, x6, :lo12:zt_in _ldr_zt 6 @@ -72,12 +69,16 @@ load_and_save: // In streaming mode? check_sm_in: tbz x7, #SVCR_SM_SHIFT, check_sve_in - mov x4, x3 // Load FFR if we have FA64 + + // Load FFR if we have FA64 + mov x4, #0 + tbz x0, #HAVE_FA64_SHIFT, load_sve + mov x4, #1 b load_sve // SVE? 
check_sve_in: - cbz x0, wait_for_writes + tbz x0, #HAVE_SVE_SHIFT, wait_for_writes mov x4, #1 load_sve: @@ -165,8 +166,7 @@ wait_for_writes: stp q28, q29, [x7, #16 * 28] stp q30, q31, [x7, #16 * 30] - // SME? - cbz x1, check_sve_out + tbz x0, #HAVE_SME_SHIFT, check_sve_out rdsvl 11, 1 adrp x6, sme_vl_out @@ -187,7 +187,7 @@ wait_for_writes: bne 1b // ZT? - cbz x2, check_sm_out + tbz x0, #HAVE_SME2_SHIFT, check_sm_out adrp x6, zt_out add x6, x6, :lo12:zt_out _str_zt 6 @@ -195,12 +195,16 @@ wait_for_writes: // In streaming mode? check_sm_out: tbz x7, #SVCR_SM_SHIFT, check_sve_out - mov x4, x3 // FFR? + + // Do we have FA64 and FFR? + mov x4, #0 + tbz x0, #HAVE_FA64_SHIFT, read_sve + mov x4, #1 b read_sve // SVE? check_sve_out: - cbz x0, wait_for_reads + tbz x0, #HAVE_SVE_SHIFT, wait_for_reads mov x4, #1 rdvl x7, #1 @@ -271,7 +275,7 @@ wait_for_reads: brk #0 // Ensure we don't leave ourselves in streaming mode - cbz x1, out + tbz x0, #HAVE_SME_SHIFT, out msr S3_3_C4_C2_2, xzr out: diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c index c7ceafe5f471..d96af27487fa 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace.c +++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c @@ -82,7 +82,7 @@ uint64_t sve_vl_out; uint64_t sme_vl_out; uint64_t svcr_in, svcr_expected, svcr_out; -void load_and_save(int sve, int sme, int sme2, int fa64); +void load_and_save(int flags); static bool got_alarm; @@ -198,7 +198,7 @@ static int vl_expected(struct test_config *config) static void run_child(struct test_config *config) { - int ret; + int ret, flags; /* Let the parent attach to us */ ret = ptrace(PTRACE_TRACEME, 0, 0, 0); @@ -224,8 +224,17 @@ static void run_child(struct test_config *config) } /* Load values and wait for the parent */ - load_and_save(sve_supported(), sme_supported(), - sme2_supported(), fa64_supported()); + flags = 0; + if (sve_supported()) + flags |= HAVE_SVE; + if (sme_supported()) + flags |= HAVE_SME; + if (sme2_supported()) + flags |= HAVE_SME2; + if (fa64_supported()) + flags |= HAVE_FA64; + + load_and_save(flags); exit(0); } diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.h b/tools/testing/selftests/arm64/fp/fp-ptrace.h index db4f2c4d750c..36ca627e1980 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace.h +++ b/tools/testing/selftests/arm64/fp/fp-ptrace.h @@ -10,4 +10,14 @@ #define SVCR_SM (1 << SVCR_SM_SHIFT) #define SVCR_ZA (1 << SVCR_ZA_SHIFT) +#define HAVE_SVE_SHIFT 0 +#define HAVE_SME_SHIFT 1 +#define HAVE_SME2_SHIFT 2 +#define HAVE_FA64_SHIFT 3 + +#define HAVE_SVE (1 << HAVE_SVE_SHIFT) +#define HAVE_SME (1 << HAVE_SME_SHIFT) +#define HAVE_SME2 (1 << HAVE_SME2_SHIFT) +#define HAVE_FA64 (1 << HAVE_FA64_SHIFT) + #endif From 7e9c5b00009a625cc304c865192978c01c0cc077 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 12 Nov 2024 13:08:15 +0000 Subject: [PATCH 159/168] kselftest/arm64: Expand the set of ZA writes fp-ptrace does Currently our test for implementable ZA writes is written in a bit of a convoluted fashion which excludes all changes where we clear SVCR.SM even though we can actually support that since changing the vector length resets SVCR. Make the logic more direct, enabling us to actually run these cases. 
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241112-arm64-fp-ptrace-fpmr-v2-2-250b57c61254@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/fp-ptrace.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c index d96af27487fa..56cf6e02c535 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace.c +++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c @@ -1078,21 +1078,19 @@ static void sve_write(pid_t child, struct test_config *config) static bool za_write_supported(struct test_config *config) { - if (config->svcr_expected & SVCR_SM) { - if (!(config->svcr_in & SVCR_SM)) + if (config->sme_vl_in != config->sme_vl_expected) { + /* Changing the SME VL exits streaming mode. */ + if (config->svcr_expected & SVCR_SM) { return false; - - /* Changing the SME VL exits streaming mode */ - if (config->sme_vl_in != config->sme_vl_expected) { + } + } else { + /* Otherwise we can't change streaming mode */ + if ((config->svcr_in & SVCR_SM) != + (config->svcr_expected & SVCR_SM)) { return false; } } - /* Can't disable SM outside a VL change */ - if ((config->svcr_in & SVCR_SM) && - !(config->svcr_expected & SVCR_SM)) - return false; - return true; } From 7dbd26d0b22d69d36ab3e76ee7f152482a19cbed Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 12 Nov 2024 13:08:16 +0000 Subject: [PATCH 160/168] kselftest/arm64: Add FPMR coverage to fp-ptrace Add coverage for FPMR to fp-ptrace. FPMR can be available independently of SVE and SME, if SME is supported then FPMR is cleared by entering and exiting streaming mode. As with other registers we generate random values to load into the register, we restrict these to bitfields which are always defined. We also leave bitfields where the valid values are affected by the set of supported FP8 formats zero to reduce complexity, it is unlikely that specific bitfields will be affected by ptrace issues. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241112-arm64-fp-ptrace-fpmr-v2-3-250b57c61254@kernel.org [catalin.marinas@arm.com: use REG_FPMR instead of FPMR] Signed-off-by: Catalin Marinas --- .../selftests/arm64/fp/fp-ptrace-asm.S | 23 +++- tools/testing/selftests/arm64/fp/fp-ptrace.c | 126 ++++++++++++++++++ tools/testing/selftests/arm64/fp/fp-ptrace.h | 2 + tools/testing/selftests/arm64/fp/sme-inst.h | 2 + 4 files changed, 146 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S b/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S index 5e7e9c878f2c..82c3ab70e1cf 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S +++ b/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S @@ -71,14 +71,12 @@ check_sm_in: tbz x7, #SVCR_SM_SHIFT, check_sve_in // Load FFR if we have FA64 - mov x4, #0 - tbz x0, #HAVE_FA64_SHIFT, load_sve - mov x4, #1 + ubfx x4, x0, #HAVE_FA64_SHIFT, #1 b load_sve // SVE? 
check_sve_in: - tbz x0, #HAVE_SVE_SHIFT, wait_for_writes + tbz x0, #HAVE_SVE_SHIFT, check_fpmr_in mov x4, #1 load_sve: @@ -143,6 +141,13 @@ load_sve: ldr p14, [x7, #14, MUL VL] ldr p15, [x7, #15, MUL VL] + // This has to come after we set PSTATE.SM +check_fpmr_in: + tbz x0, #HAVE_FPMR_SHIFT, wait_for_writes + adrp x7, fpmr_in + ldr x7, [x7, :lo12:fpmr_in] + msr REG_FPMR, x7 + wait_for_writes: // Wait for the parent brk #0 @@ -166,6 +171,12 @@ wait_for_writes: stp q28, q29, [x7, #16 * 28] stp q30, q31, [x7, #16 * 30] + tbz x0, #HAVE_FPMR_SHIFT, check_sme_out + mrs x7, REG_FPMR + adrp x6, fpmr_out + str x7, [x6, :lo12:fpmr_out] + +check_sme_out: tbz x0, #HAVE_SME_SHIFT, check_sve_out rdsvl 11, 1 @@ -197,9 +208,7 @@ check_sm_out: tbz x7, #SVCR_SM_SHIFT, check_sve_out // Do we have FA64 and FFR? - mov x4, #0 - tbz x0, #HAVE_FA64_SHIFT, read_sve - mov x4, #1 + ubfx x4, x0, #HAVE_FA64_SHIFT, #1 b read_sve // SVE? diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c index 56cf6e02c535..4930e03a7b99 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace.c +++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c @@ -31,6 +31,14 @@ #include "fp-ptrace.h" +#include + +#define FPMR_LSCALE2_MASK GENMASK(37, 32) +#define FPMR_NSCALE_MASK GENMASK(31, 24) +#define FPMR_LSCALE_MASK GENMASK(22, 16) +#define FPMR_OSC_MASK GENMASK(15, 15) +#define FPMR_OSM_MASK GENMASK(14, 14) + /* and don't like each other, so: */ #ifndef NT_ARM_SVE #define NT_ARM_SVE 0x405 @@ -48,11 +56,22 @@ #define NT_ARM_ZT 0x40d #endif +#ifndef NT_ARM_FPMR +#define NT_ARM_FPMR 0x40e +#endif + #define ARCH_VQ_MAX 256 /* VL 128..2048 in powers of 2 */ #define MAX_NUM_VLS 5 +/* + * FPMR bits we can set without doing feature checks to see if values + * are valid. 
+ */ +#define FPMR_SAFE_BITS (FPMR_LSCALE2_MASK | FPMR_NSCALE_MASK | \ + FPMR_LSCALE_MASK | FPMR_OSC_MASK | FPMR_OSM_MASK) + #define NUM_FPR 32 __uint128_t v_in[NUM_FPR]; __uint128_t v_expected[NUM_FPR]; @@ -78,6 +97,8 @@ char zt_in[ZT_SIG_REG_BYTES]; char zt_expected[ZT_SIG_REG_BYTES]; char zt_out[ZT_SIG_REG_BYTES]; +uint64_t fpmr_in, fpmr_expected, fpmr_out; + uint64_t sve_vl_out; uint64_t sme_vl_out; uint64_t svcr_in, svcr_expected, svcr_out; @@ -128,6 +149,11 @@ static bool fa64_supported(void) return getauxval(AT_HWCAP2) & HWCAP2_SME_FA64; } +static bool fpmr_supported(void) +{ + return getauxval(AT_HWCAP2) & HWCAP2_FPMR; +} + static bool compare_buffer(const char *name, void *out, void *expected, size_t size) { @@ -233,6 +259,8 @@ static void run_child(struct test_config *config) flags |= HAVE_SME2; if (fa64_supported()) flags |= HAVE_FA64; + if (fpmr_supported()) + flags |= HAVE_FPMR; load_and_save(flags); @@ -321,6 +349,14 @@ static void read_child_regs(pid_t child) iov_child.iov_len = sizeof(zt_out); read_one_child_regs(child, "ZT", &iov_parent, &iov_child); } + + if (fpmr_supported()) { + iov_parent.iov_base = &fpmr_out; + iov_parent.iov_len = sizeof(fpmr_out); + iov_child.iov_base = &fpmr_out; + iov_child.iov_len = sizeof(fpmr_out); + read_one_child_regs(child, "FPMR", &iov_parent, &iov_child); + } } static bool continue_breakpoint(pid_t child, @@ -595,6 +631,26 @@ static bool check_ptrace_values_zt(pid_t child, struct test_config *config) return compare_buffer("initial ZT", buf, zt_in, ZT_SIG_REG_BYTES); } +static bool check_ptrace_values_fpmr(pid_t child, struct test_config *config) +{ + uint64_t val; + struct iovec iov; + int ret; + + if (!fpmr_supported()) + return true; + + iov.iov_base = &val; + iov.iov_len = sizeof(val); + ret = ptrace(PTRACE_GETREGSET, child, NT_ARM_FPMR, &iov); + if (ret != 0) { + ksft_print_msg("Failed to read initial FPMR: %s (%d)\n", + strerror(errno), errno); + return false; + } + + return compare_buffer("initial FPMR", &val, &fpmr_in, sizeof(val)); +} static bool check_ptrace_values(pid_t child, struct test_config *config) { @@ -629,6 +685,9 @@ static bool check_ptrace_values(pid_t child, struct test_config *config) if (!check_ptrace_values_zt(child, config)) pass = false; + if (!check_ptrace_values_fpmr(child, config)) + pass = false; + return pass; } @@ -832,11 +891,18 @@ static void set_initial_values(struct test_config *config) { int vq = __sve_vq_from_vl(vl_in(config)); int sme_vq = __sve_vq_from_vl(config->sme_vl_in); + bool sm_change; svcr_in = config->svcr_in; svcr_expected = config->svcr_expected; svcr_out = 0; + if (sme_supported() && + (svcr_in & SVCR_SM) != (svcr_expected & SVCR_SM)) + sm_change = true; + else + sm_change = false; + fill_random(&v_in, sizeof(v_in)); memcpy(v_expected, v_in, sizeof(v_in)); memset(v_out, 0, sizeof(v_out)); @@ -883,6 +949,21 @@ static void set_initial_values(struct test_config *config) memset(zt_expected, 0, ZT_SIG_REG_BYTES); memset(zt_out, 0, sizeof(zt_out)); } + + if (fpmr_supported()) { + fill_random(&fpmr_in, sizeof(fpmr_in)); + fpmr_in &= FPMR_SAFE_BITS; + + /* Entering or exiting streaming mode clears FPMR */ + if (sm_change) + fpmr_expected = 0; + else + fpmr_expected = fpmr_in; + } else { + fpmr_in = 0; + fpmr_expected = 0; + fpmr_out = 0; + } } static bool check_memory_values(struct test_config *config) @@ -933,6 +1014,12 @@ static bool check_memory_values(struct test_config *config) if (!compare_buffer("saved ZT", zt_out, zt_expected, ZT_SIG_REG_BYTES)) pass = false; + if (fpmr_out != 
fpmr_expected) { + ksft_print_msg("Mismatch in saved FPMR: %lx != %lx\n", + fpmr_out, fpmr_expected); + pass = false; + } + return pass; } @@ -1010,6 +1097,36 @@ static void fpsimd_write(pid_t child, struct test_config *test_config) strerror(errno), errno); } +static bool fpmr_write_supported(struct test_config *config) +{ + if (!fpmr_supported()) + return false; + + if (!sve_sme_same(config)) + return false; + + return true; +} + +static void fpmr_write_expected(struct test_config *config) +{ + fill_random(&fpmr_expected, sizeof(fpmr_expected)); + fpmr_expected &= FPMR_SAFE_BITS; +} + +static void fpmr_write(pid_t child, struct test_config *config) +{ + struct iovec iov; + int ret; + + iov.iov_len = sizeof(fpmr_expected); + iov.iov_base = &fpmr_expected; + ret = ptrace(PTRACE_SETREGSET, child, NT_ARM_FPMR, &iov); + if (ret != 0) + ksft_print_msg("Failed to write FPMR: %s (%d)\n", + strerror(errno), errno); +} + static void sve_write_expected(struct test_config *config) { int vl = vl_expected(config); @@ -1266,6 +1383,12 @@ static struct test_definition base_test_defs[] = { .set_expected_values = fpsimd_write_expected, .modify_values = fpsimd_write, }, + { + .name = "FPMR write", + .supported = fpmr_write_supported, + .set_expected_values = fpmr_write_expected, + .modify_values = fpmr_write, + }, }; static struct test_definition sve_test_defs[] = { @@ -1475,6 +1598,9 @@ int main(void) if (fa64_supported()) ksft_print_msg("FA64 supported\n"); + if (fpmr_supported()) + ksft_print_msg("FPMR supported\n"); + ksft_set_plan(tests); /* Get signal handers ready before we start any children */ diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.h b/tools/testing/selftests/arm64/fp/fp-ptrace.h index 36ca627e1980..c06919aaf1f7 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace.h +++ b/tools/testing/selftests/arm64/fp/fp-ptrace.h @@ -14,10 +14,12 @@ #define HAVE_SME_SHIFT 1 #define HAVE_SME2_SHIFT 2 #define HAVE_FA64_SHIFT 3 +#define HAVE_FPMR_SHIFT 4 #define HAVE_SVE (1 << HAVE_SVE_SHIFT) #define HAVE_SME (1 << HAVE_SME_SHIFT) #define HAVE_SME2 (1 << HAVE_SME2_SHIFT) #define HAVE_FA64 (1 << HAVE_FA64_SHIFT) +#define HAVE_FPMR (1 << HAVE_FPMR_SHIFT) #endif diff --git a/tools/testing/selftests/arm64/fp/sme-inst.h b/tools/testing/selftests/arm64/fp/sme-inst.h index 9292bba5400b..85b9184e0835 100644 --- a/tools/testing/selftests/arm64/fp/sme-inst.h +++ b/tools/testing/selftests/arm64/fp/sme-inst.h @@ -5,6 +5,8 @@ #ifndef SME_INST_H #define SME_INST_H +#define REG_FPMR S3_3_C4_C4_2 + /* * RDSVL X\nx, #\imm */ From 016d659e62ad9ddda1b6899468d0d0798ed71a4d Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 12 Nov 2024 14:35:05 +0000 Subject: [PATCH 161/168] kselftest/arm64: Fix missing printf() argument in gcs/gcs-stress.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling the child_cleanup() function results in: gcs-stress.c: In function ‘child_cleanup’: gcs-stress.c:266:75: warning: format ‘%d’ expects a matching ‘int’ argument [-Wformat=] 266 | ksft_print_msg("%s: Exited due to signal %d\n", | ~^ | | | int Add the missing child->exit_signal argument. 
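For reference, a minimal standalone reproduction of the same class of bug (an illustrative sketch of my own, not code taken from the selftest); -Wformat, which -Wall enables, is what produces the warning quoted above:

  #include <stdio.h>

  static void report(const char *name, int sig)
  {
          /* printf("%s: Exited due to signal %d\n", name);   <- '%d' has no argument */
          printf("%s: Exited due to signal %d\n", name, sig); /* fixed form */
  }

  int main(void)
  {
          report("child-0", 9);
          return 0;
  }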
Fixes: 05e6cfff58c4 ("kselftest/arm64: Add a GCS stress test") Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/gcs/gcs-stress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress.c b/tools/testing/selftests/arm64/gcs/gcs-stress.c index 03222c36c436..bbc7f4950c13 100644 --- a/tools/testing/selftests/arm64/gcs/gcs-stress.c +++ b/tools/testing/selftests/arm64/gcs/gcs-stress.c @@ -264,7 +264,7 @@ static void child_cleanup(struct child_data *child) if (WIFSIGNALED(status)) { child->exit_signal = WTERMSIG(status); ksft_print_msg("%s: Exited due to signal %d\n", - child->name); + child->name, child->exit_signal); fail = true; child->exited = true; } From de7fb8d3a2c913e36d78d762e9fbdd754d8dc749 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Nov 2024 13:22:49 +0530 Subject: [PATCH 162/168] arm64/mm: Change protval as 'pteval_t' in map_range() pgprot_t has been defined as an encapsulated structure with pteval_t as its element. Hence it is prudent to use pteval_t as the type instead of via the size based u64. Besides pteval_t type might be different size later on with FEAT_D128. Cc: Will Deacon Cc: Ard Biesheuvel Cc: Ryan Roberts Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/20241111075249.609493-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/pi/map_range.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c index 5410b2cac590..2b69e3beeef8 100644 --- a/arch/arm64/kernel/pi/map_range.c +++ b/arch/arm64/kernel/pi/map_range.c @@ -30,7 +30,7 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, u64 va_offset) { u64 cmask = (level == 3) ? CONT_PTE_SIZE - 1 : U64_MAX; - u64 protval = pgprot_val(prot) & ~PTE_TYPE_MASK; + pteval_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK; int lshift = (3 - level) * (PAGE_SHIFT - 3); u64 lmask = (PAGE_SIZE << lshift) - 1; From f95382d73ec8bfee2b4a00b2ac4aa4530f7c4af3 Mon Sep 17 00:00:00 2001 From: Min-Hua Chen Date: Wed, 18 Sep 2024 07:38:24 +0800 Subject: [PATCH 163/168] acpi/arm64: remove unnecessary cast DEFINE_RES_IRQ returns struct resource type, so it is unnecessary to cast it to struct resource. Remove the unnecessary cast to fix the following sparse warnings: drivers/acpi/arm64/gtdt.c:355:19: sparse: warning: cast to non-scalar drivers/acpi/arm64/gtdt.c:355:19: sparse: warning: cast from non-scalar No functional changes intended. 
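For illustration, the shape of the pattern with a simplified stand-in macro (the field layout and flag value here are assumptions, not the real include/linux/ioport.h definition): the macro already yields a value of type struct resource, so a plain assignment works and the extra cast adds nothing.

  #include <stdio.h>

  struct resource {
          unsigned long start;
          unsigned long end;
          unsigned long flags;
  };

  /* Simplified stand-in for DEFINE_RES_IRQ(); the real macro sets more fields. */
  #define DEFINE_RES_IRQ(irq) \
          ((struct resource){ .start = (irq), .end = (irq), .flags = 0x1 })

  int main(void)
  {
          struct resource res[3];
          int irq = 42;

          res[2] = DEFINE_RES_IRQ(irq);   /* no cast needed */
          printf("irq resource: start=%lu\n", res[2].start);
          return 0;
  }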
Signed-off-by: Min-Hua Chen Acked-by: Hanjun Guo Reviewed-by: Hanjun Guo Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240917233827.73167-1-minhuadotchen@gmail.com Signed-off-by: Catalin Marinas --- drivers/acpi/arm64/gtdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c index 81dc0582743d..3561553eff8b 100644 --- a/drivers/acpi/arm64/gtdt.c +++ b/drivers/acpi/arm64/gtdt.c @@ -363,7 +363,7 @@ static int __init gtdt_import_sbsa_gwdt(struct acpi_gtdt_watchdog *wd, } irq = map_gt_gsi(wd->timer_interrupt, wd->timer_flags); - res[2] = (struct resource)DEFINE_RES_IRQ(irq); + res[2] = DEFINE_RES_IRQ(irq); if (irq <= 0) { pr_warn("failed to map the Watchdog interrupt.\n"); nr_res--; From 3e360ef0c0a1fb6ce9a302e40b8057c41ba8a9d2 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 7 Nov 2024 01:39:22 +0000 Subject: [PATCH 164/168] kselftest/arm64: Corrupt P0 in the irritator when testing SSVE When building for streaming SVE the irritator for SVE skips updates of both P0 and FFR. While FFR is skipped since it might not be present there is no reason to skip corrupting P0 so switch to an instruction valid in streaming mode and move the ifdef. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241107-arm64-fp-stress-irritator-v2-3-c4b9622e36ee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/sve-test.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S index f1fb9745c681..28eb8b5cc2d2 100644 --- a/tools/testing/selftests/arm64/fp/sve-test.S +++ b/tools/testing/selftests/arm64/fp/sve-test.S @@ -302,9 +302,9 @@ function irritator_handler movi v0.8b, #1 movi v9.16b, #2 movi v31.8b, #3 -#ifndef SSVE // And P0 - rdffr p0.b + ptrue p0.d +#ifndef SSVE // And FFR wrffr p15.b #endif From c0139f6cbb1fc6438509467ec0455a200cc49a43 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 6 Nov 2024 17:41:32 +0000 Subject: [PATCH 165/168] arm64/ptrace: Clarify documentation of VL configuration via ptrace When we configure SVE, SSVE or ZA via ptrace we allow the user to configure the vector length and specify any of the flags that are accepted when configuring via prctl(). This includes the S[VM]E_SET_VL_ONEXEC flag which defers the configuration of the VL until an exec(). We don't do anything to limit the provision of register data as part of configuring the _ONEXEC VL but as a function of the VL enumeration support we do this will be interpreted using the vector length currently configured for the process. This is all a bit surprising, and probably we should just not have allowed register data to be specified with _ONEXEC, but it's our ABI so let's add some explicit documentation in both the ABI documents and the source calling out what happens. The comments are also missing the fact that since SME does not have a mandatory 128 bit VL it is possible for VL enumeration to result in the configuration of a higher VL than was requested, cover that too. 
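As a rough sketch of the interface being documented (my own illustration, not kernel or selftest code; the exact header set may need adjusting and error handling is omitted), one way a tracer might request a VL change deferred to exec() is to write only the regset header, since any register payload appended here would be decoded using the current VL rather than the one being requested:

  #include <elf.h>
  #include <string.h>
  #include <sys/ptrace.h>
  #include <sys/types.h>
  #include <sys/uio.h>
  #include <asm/ptrace.h>

  /* Ask for a new SVE VL that only takes effect at the tracee's next exec(). */
  static long set_sve_vl_onexec(pid_t child, unsigned int vl)
  {
          struct user_sve_header header;
          struct iovec iov;

          memset(&header, 0, sizeof(header));
          header.size = sizeof(header);
          header.vl = vl;
          header.flags = SVE_PT_VL_ONEXEC | SVE_PT_REGS_FPSIMD;

          iov.iov_base = &header;
          iov.iov_len = sizeof(header);

          return ptrace(PTRACE_SETREGSET, child, NT_ARM_SVE, &iov);
  }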
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241106-arm64-sve-ptrace-vl-set-v1-1-3b164e8b559c@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/sme.rst | 4 ++++ Documentation/arch/arm64/sve.rst | 4 ++++ arch/arm64/kernel/ptrace.c | 12 ++++++++++-- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/Documentation/arch/arm64/sme.rst b/Documentation/arch/arm64/sme.rst index be317d457417..b2fa01f85cb5 100644 --- a/Documentation/arch/arm64/sme.rst +++ b/Documentation/arch/arm64/sme.rst @@ -346,6 +346,10 @@ The regset data starts with struct user_za_header, containing: * Writes to NT_ARM_ZT will set PSTATE.ZA to 1. +* If any register data is provided along with SME_PT_VL_ONEXEC then the + registers data will be interpreted with the current vector length, not + the vector length configured for use on exec. + 8. ELF coredump extensions --------------------------- diff --git a/Documentation/arch/arm64/sve.rst b/Documentation/arch/arm64/sve.rst index 8d8837fc39ec..28152492c29c 100644 --- a/Documentation/arch/arm64/sve.rst +++ b/Documentation/arch/arm64/sve.rst @@ -402,6 +402,10 @@ The regset data starts with struct user_sve_header, containing: streaming mode and any SETREGSET of NT_ARM_SSVE will enter streaming mode if the target was not in streaming mode. +* If any register data is provided along with SVE_PT_VL_ONEXEC then the + registers data will be interpreted with the current vector length, not + the vector length configured for use on exec. + * The effect of writing a partial, incomplete payload is unspecified. diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index b756578aeaee..f09ffd70c916 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -898,7 +898,11 @@ static int sve_set_common(struct task_struct *target, if (ret) goto out; - /* Actual VL set may be less than the user asked for: */ + /* + * Actual VL set may be different from what the user asked + * for, or we may have configured the _ONEXEC VL not the + * current VL: + */ vq = sve_vq_from_vl(task_get_vl(target, type)); /* Enter/exit streaming mode */ @@ -1125,7 +1129,11 @@ static int za_set(struct task_struct *target, if (ret) goto out; - /* Actual VL set may be less than the user asked for: */ + /* + * Actual VL set may be different from what the user asked + * for, or we may have configured the _ONEXEC rather than + * current VL: + */ vq = sve_vq_from_vl(task_get_sme_vl(target)); /* Ensure there is some SVE storage for streaming mode */ From 27141b690547da5650a420f26ec369ba142a9ebb Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 11 Nov 2024 16:18:55 +0000 Subject: [PATCH 166/168] kselftest/arm64: Don't leak pipe fds in pac.exec_sign_all() The PAC exec_sign_all() test spawns some child processes, creating pipes to be stdin and stdout for the child. It cleans up most of the file descriptors that are created as part of this but neglects to clean up the parent end of the child stdin and stdout. Add the missing close() calls. 
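For reference, the general shape of the pattern (a generic sketch, not the selftest's exec_sign_all() itself): every pipe end has to be closed in the process that no longer needs it, including the parent's own ends once communication is finished.

  #include <sys/types.h>
  #include <sys/wait.h>
  #include <unistd.h>

  static int run_child_with_pipes(void)
  {
          int new_stdin[2], new_stdout[2];
          pid_t pid;

          if (pipe(new_stdin))
                  return -1;
          if (pipe(new_stdout)) {
                  close(new_stdin[0]);
                  close(new_stdin[1]);
                  return -1;
          }

          pid = fork();
          if (pid < 0) {
                  close(new_stdin[0]);
                  close(new_stdin[1]);
                  close(new_stdout[0]);
                  close(new_stdout[1]);
                  return -1;
          }

          if (pid == 0) {
                  /* Child: wire the pipes up as stdin/stdout, drop the rest. */
                  dup2(new_stdin[0], STDIN_FILENO);
                  dup2(new_stdout[1], STDOUT_FILENO);
                  close(new_stdin[0]);
                  close(new_stdin[1]);
                  close(new_stdout[0]);
                  close(new_stdout[1]);
                  execlp("cat", "cat", (char *)NULL);
                  _exit(127);
          }

          /* Parent: the child's ends are not needed here. */
          close(new_stdin[0]);
          close(new_stdout[1]);

          /* ... write to new_stdin[1], read from new_stdout[0] ... */

          /*
           * The parent's own ends must also be closed once communication is
           * done; forgetting these is the kind of leak fixed above.
           */
          close(new_stdin[1]);
          close(new_stdout[0]);

          return waitpid(pid, NULL, 0) < 0 ? -1 : 0;
  }

  int main(void)
  {
          return run_child_with_pipes() ? 1 : 0;
  }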
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241111-arm64-pac-test-collisions-v1-1-171875f37e44@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/pauth/pac.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/arm64/pauth/pac.c b/tools/testing/selftests/arm64/pauth/pac.c index b743daa772f5..5a07b3958fbf 100644 --- a/tools/testing/selftests/arm64/pauth/pac.c +++ b/tools/testing/selftests/arm64/pauth/pac.c @@ -182,6 +182,9 @@ int exec_sign_all(struct signatures *signed_vals, size_t val) return -1; } + close(new_stdin[1]); + close(new_stdout[0]); + return 0; } From 91a6533811bb81139c5a44d039b9b0a6af238bc8 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 11 Nov 2024 16:18:56 +0000 Subject: [PATCH 167/168] kselftest/arm64: Try harder to generate different keys during PAC tests We very intermittently see failures in the single_thread_different_keys PAC test. As noted in the comment in the test the PAC field can be quite narrow so there is a chance of collisions even with different keys with a chance of 5% for 7 bit keys, and the potential for narrower keys. The test tries to avoid this by running repeatedly, but only tries 10 times which even with a 5% chance of collisions isn't enough. Increase the number of times we attempt to look for collisions by a factor of 100, this also affects other tests which are following a similar pattern with running the test repeatedly and either don't care like with pac_instruction_not_nop or potentially have the same issue like exec_sign_all. The PAC tests are very fast, running in a second or two even in emulation, so the 100x increased cost is mildly irritating but not a huge issue. The bulk of the overhead is in the exec_sign_all test which does a fork() and exec() per iteration. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241111-arm64-pac-test-collisions-v1-2-171875f37e44@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/pauth/pac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/pauth/pac.c b/tools/testing/selftests/arm64/pauth/pac.c index 5a07b3958fbf..6d21b2fc758d 100644 --- a/tools/testing/selftests/arm64/pauth/pac.c +++ b/tools/testing/selftests/arm64/pauth/pac.c @@ -13,7 +13,7 @@ #include "../../kselftest_harness.h" #include "helper.h" -#define PAC_COLLISION_ATTEMPTS 10 +#define PAC_COLLISION_ATTEMPTS 1000 /* * The kernel sets TBID by default. So bits 55 and above should remain * untouched no matter what. From 67ab51cbdfee02ef07fb9d7d14cc0bf6cb5a5e5c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 14 Nov 2024 09:53:32 +0000 Subject: [PATCH 168/168] arm64: tls: Fix context-switching of tpidrro_el0 when kpti is enabled Commit 18011eac28c7 ("arm64: tls: Avoid unconditional zeroing of tpidrro_el0 for native tasks") tried to optimise the context switching of tpidrro_el0 by eliding the clearing of the register when switching to a native task with kpti enabled, on the erroneous assumption that the kpti trampoline entry code would already have taken care of the write. Although the kpti trampoline does zero the register on entry from a native task, the check in tls_thread_switch() is on the *next* task and so we can end up leaving a stale, non-zero value in the register if the previous task was 32-bit. Drop the broken optimisation and zero tpidrro_el0 unconditionally when switching to a native 64-bit task. 
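One way to read the failing sequence (reconstructed from the description above rather than taken from the original report):

  1. A 32-bit (compat) task runs, so tpidrro_el0 holds that task's TLS value.
  2. The scheduler switches to a native 64-bit task. With kpti enabled the old
     !arm64_kernel_unmapped_at_el0() check makes tls_thread_switch() skip the
     write, on the assumption that the trampoline has already zeroed the
     register.
  3. But the trampoline only zeroes tpidrro_el0 on kernel entry from a native
     task, and this entry came from the compat task, so the stale TLS value is
     still there and becomes visible to the native task.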
Cc: Mark Rutland Cc: stable@vger.kernel.org Fixes: 18011eac28c7 ("arm64: tls: Avoid unconditional zeroing of tpidrro_el0 for native tasks") Signed-off-by: Will Deacon Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20241114095332.23391-1-will@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 0540653fbf38..3d78798c4a0a 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -439,7 +439,7 @@ static void tls_thread_switch(struct task_struct *next) if (is_compat_thread(task_thread_info(next))) write_sysreg(next->thread.uw.tp_value, tpidrro_el0); - else if (!arm64_kernel_unmapped_at_el0()) + else write_sysreg(0, tpidrro_el0); write_sysreg(*task_user_tls(next), tpidr_el0);
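As a postscript, a quick way to look at the register from userspace (my own sketch, arm64-only; whether a stale value is ever observed depends on scheduling): a native 64-bit task should always read zero here.

  #include <stdio.h>

  static inline unsigned long read_tpidrro_el0(void)
  {
          unsigned long val;

          /* TPIDRRO_EL0 is readable, though not writable, at EL0. */
          asm volatile("mrs %0, tpidrro_el0" : "=r" (val));
          return val;
  }

  int main(void)
  {
          printf("tpidrro_el0 = %#lx\n", read_tpidrro_el0());
          return 0;
  }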