From bcc9d04e749a8cbdbe1b26285f0f69e315c70821 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:40 +0100 Subject: [PATCH 001/168] mm: Introduce ARCH_HAS_USER_SHADOW_STACK Since multiple architectures have support for shadow stacks and we need to select support for this feature in several places in the generic code provide a generic config option that the architectures can select. Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Reviewed-by: Deepak Gupta Reviewed-by: Rick Edgecombe Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Catalin Marinas Reviewed-by: Kees Cook Tested-by: Kees Cook Acked-by: Shuah Khan Reviewed-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-1-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/x86/Kconfig | 1 + fs/proc/task_mmu.c | 2 +- mm/Kconfig | 6 ++++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2852fcd82cbd..8ccae77d40f7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1954,6 +1954,7 @@ config X86_USER_SHADOW_STACK depends on AS_WRUSS depends on X86_64 select ARCH_USES_HIGH_VMA_FLAGS + select ARCH_HAS_USER_SHADOW_STACK select X86_CET help Shadow stack protection is a hardware feature that detects function diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 72f14fd59c2d..23f875e78eae 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -971,7 +971,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR [ilog2(VM_UFFD_MINOR)] = "ui", #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ -#ifdef CONFIG_X86_USER_SHADOW_STACK +#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK [ilog2(VM_SHADOW_STACK)] = "ss", #endif #if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) diff --git a/mm/Kconfig b/mm/Kconfig index 4c9f5ea13271..4b2a1ef9a161 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1296,6 +1296,12 @@ config NUMA_EMU into virtual nodes when booted with "numa=fake=N", where N is the number of nodes. This is only useful for debugging. +config ARCH_HAS_USER_SHADOW_STACK + bool + help + The architecture has hardware support for userspace shadow call + stacks (eg, x86 CET, arm64 GCS or RISC-V Zicfiss). + source "mm/damon/Kconfig" endmenu From 9ab515b18f8463fbb340fece47cd461809e42a9d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:41 +0100 Subject: [PATCH 002/168] mm: Define VM_HIGH_ARCH_6 The addition of protection keys means that on arm64 we now use all of the currently defined VM_HIGH_ARCH_x bits. In order to allow us to allocate a new flag for GCS pages define VM_HIGH_ARCH_6. 
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-2-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index ecf63d2b0582..182bad0c55df 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -329,12 +329,14 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) +#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS From f645e888b1a6760532d8d89714cb698dd52d89bd Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:42 +0100 Subject: [PATCH 003/168] arm64/mm: Restructure arch_validate_flags() for extensibility Currently arch_validate_flags() is written in a very non-extensible fashion, returning immediately if MTE is not supported and writing the MTE check as a direct return. Since we will want to add more checks for GCS refactor the existing code to be more extensible, no functional change intended. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-3-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mman.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h index 9e39217b4afb..03b790fd0ad8 100644 --- a/arch/arm64/include/asm/mman.h +++ b/arch/arm64/include/asm/mman.h @@ -62,11 +62,17 @@ static inline bool arch_validate_prot(unsigned long prot, static inline bool arch_validate_flags(unsigned long vm_flags) { - if (!system_supports_mte()) - return true; + if (system_supports_mte()) { + /* + * only allow VM_MTE if VM_MTE_ALLOWED has been set + * previously + */ + if ((vm_flags & VM_MTE) && !(vm_flags & VM_MTE_ALLOWED)) + return false; + } + + return true; - /* only allow VM_MTE if VM_MTE_ALLOWED has been set previously */ - return !(vm_flags & VM_MTE) || (vm_flags & VM_MTE_ALLOWED); } #define arch_validate_flags(vm_flags) arch_validate_flags(vm_flags) From 91e102e79740ae43ded050ccac71aa3371db4f33 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:43 +0100 Subject: [PATCH 004/168] prctl: arch-agnostic prctl for shadow stack Three architectures (x86, aarch64, riscv) have announced support for shadow stacks with fairly similar functionality. While x86 is using arch_prctl() to control the functionality neither arm64 nor riscv uses that interface so this patch adds arch-agnostic prctl() support to get and set status of shadow stacks and lock the current configuation to prevent further changes, with support for turning on and off individual subfeatures so applications can limit their exposure to features that they do not need. 
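As an illustration only (this is not part of the patch), the interface is expected to be used along the following lines, noting that in practice the enable happens very early since, as described later in the series, it is not generally possible to return from frames entered before the feature was switched on:

#include <sys/prctl.h>

static void setup_shadow_stack(void)
{
        unsigned long status = 0;

        /* Allocate a shadow stack and enable enforcement for this thread. */
        if (prctl(PR_SET_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE,
                  0, 0, 0))
                return;

        /* Read back the flags now in effect. */
        prctl(PR_GET_SHADOW_STACK_STATUS, &status, 0, 0, 0);

        /* Prevent any further change to the enable bit. */
        prctl(PR_LOCK_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE, 0, 0, 0);
}
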
The features are: - PR_SHADOW_STACK_ENABLE: Tracking and enforcement of shadow stacks, including allocation of a shadow stack if one is not already allocated. - PR_SHADOW_STACK_WRITE: Writes to specific addresses in the shadow stack. - PR_SHADOW_STACK_PUSH: Push additional values onto the shadow stack. These features are expected to be inherited by new threads and cleared on exec(), unknown features should be rejected for enable but accepted for locking (in order to allow for future proofing). This is based on a patch originally written by Deepak Gupta but modified fairly heavily, support for indirect landing pads is removed, additional modes added and the locking interface reworked. The set status prctl() is also reworked to just set flags, if setting/reading the shadow stack pointer is required this could be a separate prctl. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Acked-by: Yury Khrustalev Signed-off-by: Mark Brown Reviewed-by: Deepak Gupta Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-4-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- include/linux/mm.h | 4 ++++ include/uapi/linux/prctl.h | 22 ++++++++++++++++++++++ kernel/sys.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 182bad0c55df..56654306a832 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4221,4 +4221,8 @@ static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) } #endif /* CONFIG_MEM_ALLOC_PROFILING */ +int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status); +int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); +int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); + #endif /* _LINUX_MM_H */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 35791791a879..557a3d2ac1d4 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -328,4 +328,26 @@ struct prctl_mm_map { # define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC 0x10 /* Clear the aspect on exec */ # define PR_PPC_DEXCR_CTRL_MASK 0x1f +/* + * Get the current shadow stack configuration for the current thread, + * this will be the value configured via PR_SET_SHADOW_STACK_STATUS. + */ +#define PR_GET_SHADOW_STACK_STATUS 74 + +/* + * Set the current shadow stack configuration. Enabling the shadow + * stack will cause a shadow stack to be allocated for the thread. + */ +#define PR_SET_SHADOW_STACK_STATUS 75 +# define PR_SHADOW_STACK_ENABLE (1UL << 0) +# define PR_SHADOW_STACK_WRITE (1UL << 1) +# define PR_SHADOW_STACK_PUSH (1UL << 2) + +/* + * Prevent further changes to the specified shadow stack + * configuration. All bits may be locked via this call, including + * undefined bits. 
+ */ +#define PR_LOCK_SHADOW_STACK_STATUS 76 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 4da31f28fda8..3d38a9c7c5c9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2324,6 +2324,21 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, return -EINVAL; } +int __weak arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status) +{ + return -EINVAL; +} + +int __weak arch_set_shadow_stack_status(struct task_struct *t, unsigned long status) +{ + return -EINVAL; +} + +int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status) +{ + return -EINVAL; +} + #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) #ifdef CONFIG_ANON_VMA_NAME @@ -2784,6 +2799,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_RISCV_SET_ICACHE_FLUSH_CTX: error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3); break; + case PR_GET_SHADOW_STACK_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_get_shadow_stack_status(me, (unsigned long __user *) arg2); + break; + case PR_SET_SHADOW_STACK_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_set_shadow_stack_status(me, arg2); + break; + case PR_LOCK_SHADOW_STACK_STATUS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_lock_shadow_stack_status(me, arg2); + break; default: error = -EINVAL; break; From 3630e82ab6bd2642f0fc03b574783ccf2fb0c955 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:44 +0100 Subject: [PATCH 005/168] mman: Add map_shadow_stack() flags In preparation for adding arm64 GCS support make the map_shadow_stack() SHADOW_STACK_SET_TOKEN flag generic and add _SET_MARKER. The existing flag indicates that a token usable for stack switch should be added to the top of the newly mapped GCS region while the new flag indicates that a top of stack marker suitable for use by unwinders should be added above that. For arm64 the top of stack marker is all bits 0. 
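For illustration only (not part of this patch), a userspace sketch requesting both the token and the marker might look like this, assuming kernel headers that provide __NR_map_shadow_stack and the flag definitions:

#include <stddef.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SHADOW_STACK_SET_TOKEN
#define SHADOW_STACK_SET_TOKEN  (1ULL << 0)
#define SHADOW_STACK_SET_MARKER (1ULL << 1)
#endif

static void *map_gcs(size_t size)
{
        /*
         * Let the kernel choose the address; place an end of stack
         * marker at the top and a cap usable for stack switching
         * below it.
         */
        long ret = syscall(__NR_map_shadow_stack, 0, size,
                           SHADOW_STACK_SET_TOKEN | SHADOW_STACK_SET_MARKER);

        return ret == -1 ? NULL : (void *)ret;
}
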
Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Acked-by: Yury Khrustalev Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-5-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/x86/include/uapi/asm/mman.h | 3 --- include/uapi/asm-generic/mman.h | 4 ++++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h index 46cdc941f958..ac1e6277212b 100644 --- a/arch/x86/include/uapi/asm/mman.h +++ b/arch/x86/include/uapi/asm/mman.h @@ -5,9 +5,6 @@ #define MAP_32BIT 0x40 /* only give out 32bit addresses */ #define MAP_ABOVE4G 0x80 /* only map above 4GB */ -/* Flags for map_shadow_stack(2) */ -#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in the shadow stack */ - #include #endif /* _ASM_X86_MMAN_H */ diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h index 57e8195d0b53..5e3d61ddbd8c 100644 --- a/include/uapi/asm-generic/mman.h +++ b/include/uapi/asm-generic/mman.h @@ -19,4 +19,8 @@ #define MCL_FUTURE 2 /* lock all future mappings */ #define MCL_ONFAULT 4 /* lock all pages that are faulted in */ +#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in the shadow stack */ +#define SHADOW_STACK_SET_MARKER (1ULL << 1) /* Set up a top of stack marker in the shadow stack */ + + #endif /* __ASM_GENERIC_MMAN_H */ From 830ae8a39685e330b70437529912c17337380ae4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:45 +0100 Subject: [PATCH 006/168] arm64: Document boot requirements for Guarded Control Stacks FEAT_GCS introduces a number of new system registers, we require that access to these registers is not trapped when we identify that the feature is present. There is also a HCRX_EL2 control to make GCS operations functional. Since if GCS is enabled any function call instruction will cause a fault we also require that the feature be specifically disabled, existing kernels implicitly have this requirement and especially given that the MMU must be disabled it is difficult to see a situation where leaving GCS enabled would be reasonable. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-6-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/booting.rst | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/Documentation/arch/arm64/booting.rst b/Documentation/arch/arm64/booting.rst index b57776a68f15..aed6e9f47cf3 100644 --- a/Documentation/arch/arm64/booting.rst +++ b/Documentation/arch/arm64/booting.rst @@ -411,6 +411,38 @@ Before jumping into the kernel, the following conditions must be met: - HFGRWR_EL2.nPIRE0_EL1 (bit 57) must be initialised to 0b1. + - For CPUs with Guarded Control Stacks (FEAT_GCS): + + - GCSCR_EL1 must be initialised to 0. + + - GCSCRE0_EL1 must be initialised to 0. + + - If EL3 is present: + + - SCR_EL3.GCSEn (bit 39) must be initialised to 0b1. + + - If EL2 is present: + + - GCSCR_EL2 must be initialised to 0. + + - If the kernel is entered at EL1 and EL2 is present: + + - HCRX_EL2.GCSEn must be initialised to 0b1. + + - HFGITR_EL2.nGCSEPP (bit 59) must be initialised to 0b1. + + - HFGITR_EL2.nGCSSTR_EL1 (bit 58) must be initialised to 0b1. + + - HFGITR_EL2.nGCSPUSHM_EL1 (bit 57) must be initialised to 0b1. + + - HFGRTR_EL2.nGCS_EL1 (bit 53) must be initialised to 0b1. 
+ + - HFGRTR_EL2.nGCS_EL0 (bit 52) must be initialised to 0b1. + + - HFGWTR_EL2.nGCS_EL1 (bit 53) must be initialised to 0b1. + + - HFGWTR_EL2.nGCS_EL0 (bit 52) must be initialised to 0b1. + The requirements described above for CPU mode, caches, MMUs, architected timers, coherency and system registers apply to all CPUs. All CPUs must enter the kernel in the same exception level. Where the values documented From 7058bf87cd597e0433c2e8207139f922b9df3ef8 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:46 +0100 Subject: [PATCH 007/168] arm64/gcs: Document the ABI for Guarded Control Stacks Add some documentation of the userspace ABI for Guarded Control Stacks. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Acked-by: Yury Khrustalev Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-7-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/gcs.rst | 230 +++++++++++++++++++++++++++++ Documentation/arch/arm64/index.rst | 1 + 2 files changed, 231 insertions(+) create mode 100644 Documentation/arch/arm64/gcs.rst diff --git a/Documentation/arch/arm64/gcs.rst b/Documentation/arch/arm64/gcs.rst new file mode 100644 index 000000000000..af58d9151cb7 --- /dev/null +++ b/Documentation/arch/arm64/gcs.rst @@ -0,0 +1,230 @@ +=============================================== +Guarded Control Stack support for AArch64 Linux +=============================================== + +This document outlines briefly the interface provided to userspace by Linux in +order to support use of the ARM Guarded Control Stack (GCS) feature. + +This is an outline of the most important features and issues only and not +intended to be exhaustive. + + + +1. General +----------- + +* GCS is an architecture feature intended to provide greater protection + against return oriented programming (ROP) attacks and to simplify the + implementation of features that need to collect stack traces such as + profiling. + +* When GCS is enabled a separate guarded control stack is maintained by the + PE which is writeable only through specific GCS operations. This + stores the call stack only, when a procedure call instruction is + performed the current PC is pushed onto the GCS and on RET the + address in the LR is verified against that on the top of the GCS. + +* When active the current GCS pointer is stored in the system register + GCSPR_EL0. This is readable by userspace but can only be updated + via specific GCS instructions. + +* The architecture provides instructions for switching between guarded + control stacks with checks to ensure that the new stack is a valid + target for switching. + +* The functionality of GCS is similar to that provided by the x86 Shadow + Stack feature, due to sharing of userspace interfaces the ABI refers to + shadow stacks rather than GCS. + +* Support for GCS is reported to userspace via HWCAP_GCS in the aux vector + AT_HWCAP2 entry. + +* GCS is enabled per thread. While there is support for disabling GCS + at runtime this should be done with great care. + +* GCS memory access faults are reported as normal memory access faults. + +* GCS specific errors (those reported with EC 0x2d) will be reported as + SIGSEGV with a si_code of SEGV_CPERR (control protection error). + +* GCS is supported only for AArch64. + +* On systems where GCS is supported GCSPR_EL0 is always readable by EL0 + regardless of the GCS configuration for the thread. 
+ +* The architecture supports enabling GCS without verifying that return values + in LR match those in the GCS, the LR will be ignored. This is not supported + by Linux. + + + +2. Enabling and disabling Guarded Control Stacks +------------------------------------------------- + +* GCS is enabled and disabled for a thread via the PR_SET_SHADOW_STACK_STATUS + prctl(), this takes a single flags argument specifying which GCS features + should be used. + +* When set PR_SHADOW_STACK_ENABLE flag allocates a Guarded Control Stack + and enables GCS for the thread, enabling the functionality controlled by + GCSCRE0_EL1.{nTR, RVCHKEN, PCRSEL}. + +* When set the PR_SHADOW_STACK_PUSH flag enables the functionality controlled + by GCSCRE0_EL1.PUSHMEn, allowing explicit GCS pushes. + +* When set the PR_SHADOW_STACK_WRITE flag enables the functionality controlled + by GCSCRE0_EL1.STREn, allowing explicit stores to the Guarded Control Stack. + +* Any unknown flags will cause PR_SET_SHADOW_STACK_STATUS to return -EINVAL. + +* PR_LOCK_SHADOW_STACK_STATUS is passed a bitmask of features with the same + values as used for PR_SET_SHADOW_STACK_STATUS. Any future changes to the + status of the specified GCS mode bits will be rejected. + +* PR_LOCK_SHADOW_STACK_STATUS allows any bit to be locked, this allows + userspace to prevent changes to any future features. + +* There is no support for a process to remove a lock that has been set for + it. + +* PR_SET_SHADOW_STACK_STATUS and PR_LOCK_SHADOW_STACK_STATUS affect only the + thread that called them, any other running threads will be unaffected. + +* New threads inherit the GCS configuration of the thread that created them. + +* GCS is disabled on exec(). + +* The current GCS configuration for a thread may be read with the + PR_GET_SHADOW_STACK_STATUS prctl(), this returns the same flags that + are passed to PR_SET_SHADOW_STACK_STATUS. + +* If GCS is disabled for a thread after having previously been enabled then + the stack will remain allocated for the lifetime of the thread. At present + any attempt to reenable GCS for the thread will be rejected, this may be + revisited in future. + +* It should be noted that since enabling GCS will result in GCS becoming + active immediately it is not normally possible to return from the function + that invoked the prctl() that enabled GCS. It is expected that the normal + usage will be that GCS is enabled very early in execution of a program. + + + +3. Allocation of Guarded Control Stacks +---------------------------------------- + +* When GCS is enabled for a thread a new Guarded Control Stack will be + allocated for it of half the standard stack size or 2 gigabytes, + whichever is smaller. + +* When a new thread is created by a thread which has GCS enabled then a + new Guarded Control Stack will be allocated for the new thread with + half the size of the standard stack. + +* When a stack is allocated by enabling GCS or during thread creation then + the top 8 bytes of the stack will be initialised to 0 and GCSPR_EL0 will + be set to point to the address of this 0 value, this can be used to + detect the top of the stack. + +* Additional Guarded Control Stacks can be allocated using the + map_shadow_stack() system call. + +* Stacks allocated using map_shadow_stack() can optionally have an end of + stack marker and cap placed at the top of the stack. 
If the flag + SHADOW_STACK_SET_TOKEN is specified a cap will be placed on the stack, + if SHADOW_STACK_SET_MARKER is not specified the cap will be the top 8 + bytes of the stack and if it is specified then the cap will be the next + 8 bytes. While specifying just SHADOW_STACK_SET_MARKER by itself is + valid since the marker is all bits 0 it has no observable effect. + +* Stacks allocated using map_shadow_stack() must have a size which is a + multiple of 8 bytes larger than 8 bytes and must be 8 bytes aligned. + +* An address can be specified to map_shadow_stack(), if one is provided then + it must be aligned to a page boundary. + +* When a thread is freed the Guarded Control Stack initially allocated for + that thread will be freed. Note carefully that if the stack has been + switched this may not be the stack currently in use by the thread. + + +4. Signal handling +-------------------- + +* A new signal frame record gcs_context encodes the current GCS mode and + pointer for the interrupted context on signal delivery. This will always + be present on systems that support GCS. + +* The record contains a flag field which reports the current GCS configuration + for the interrupted context as PR_GET_SHADOW_STACK_STATUS would. + +* The signal handler is run with the same GCS configuration as the interrupted + context. + +* When GCS is enabled for the interrupted thread a signal handling specific + GCS cap token will be written to the GCS, this is an architectural GCS cap + with the token type (bits 0..11) all clear. The GCSPR_EL0 reported in the + signal frame will point to this cap token. + +* The signal handler will use the same GCS as the interrupted context. + +* When GCS is enabled on signal entry a frame with the address of the signal + return handler will be pushed onto the GCS, allowing return from the signal + handler via RET as normal. This will not be reported in the gcs_context in + the signal frame. + + +5. Signal return +----------------- + +When returning from a signal handler: + +* If there is a gcs_context record in the signal frame then the GCS flags + and GCSPR_EL0 will be restored from that context prior to further + validation. + +* If there is no gcs_context record in the signal frame then the GCS + configuration will be unchanged. + +* If GCS is enabled on return from a signal handler then GCSPR_EL0 must + point to a valid GCS signal cap record, this will be popped from the + GCS prior to signal return. + +* If the GCS configuration is locked when returning from a signal then any + attempt to change the GCS configuration will be treated as an error. This + is true even if GCS was not enabled prior to signal entry. + +* GCS may be disabled via signal return but any attempt to enable GCS via + signal return will be rejected. + + +6. ptrace extensions +--------------------- + +* A new regset NT_ARM_GCS is defined for use with PTRACE_GETREGSET and + PTRACE_SETREGSET. + +* Due to the complexity surrounding allocation and deallocation of stacks and + lack of practical application it is not possible to enable GCS via ptrace. + GCS may be disabled via the ptrace interface. + +* Other GCS modes may be configured via ptrace. + +* Configuration via ptrace ignores locking of GCS mode bits. + + +7. ELF coredump extensions +--------------------------- + +* NT_ARM_GCS notes will be added to each coredump for each thread of the + dumped process. 
The contents will be equivalent to the data that would + have been read if a PTRACE_GETREGSET of the corresponding type were + executed for each thread when the coredump was generated. + + + +8. /proc extensions +-------------------- + +* Guarded Control Stack pages will include "ss" in their VmFlags in + /proc//smaps. diff --git a/Documentation/arch/arm64/index.rst b/Documentation/arch/arm64/index.rst index 78544de0a8a9..056f6a739d25 100644 --- a/Documentation/arch/arm64/index.rst +++ b/Documentation/arch/arm64/index.rst @@ -15,6 +15,7 @@ ARM64 Architecture cpu-feature-registers cpu-hotplug elf_hwcaps + gcs hugetlbpage kdump legacy_instructions From ce0641d48ddd240053138ce55c3423f833a4237b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:47 +0100 Subject: [PATCH 008/168] arm64/sysreg: Add definitions for architected GCS caps The architecture defines a format for guarded control stack caps, used to mark the top of an unused GCS in order to limit the potential for exploitation via stack switching. Add definitions associated with these. Reviewed-by: Thiago Jung Bauermann Acked-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-8-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/sysreg.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 9ea97dddefc4..9c98ff448bd9 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1101,6 +1101,26 @@ /* Initial value for Permission Overlay Extension for EL0 */ #define POR_EL0_INIT POE_RXW +/* + * Definitions for Guarded Control Stack + */ + +#define GCS_CAP_ADDR_MASK GENMASK(63, 12) +#define GCS_CAP_ADDR_SHIFT 12 +#define GCS_CAP_ADDR_WIDTH 52 +#define GCS_CAP_ADDR(x) FIELD_GET(GCS_CAP_ADDR_MASK, x) + +#define GCS_CAP_TOKEN_MASK GENMASK(11, 0) +#define GCS_CAP_TOKEN_SHIFT 0 +#define GCS_CAP_TOKEN_WIDTH 12 +#define GCS_CAP_TOKEN(x) FIELD_GET(GCS_CAP_TOKEN_MASK, x) + +#define GCS_CAP_VALID_TOKEN 0x1 +#define GCS_CAP_IN_PROGRESS_TOKEN 0x5 + +#define GCS_CAP(x) ((((unsigned long)x) & GCS_CAP_ADDR_MASK) | \ + GCS_CAP_VALID_TOKEN) + #define ARM64_FEATURE_FIELD_BITS 4 /* Defined for compatibility only, do not add new users. */ From dad947cc22cff28348d04e21fa4d6c882385fd7d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:48 +0100 Subject: [PATCH 009/168] arm64/gcs: Add manual encodings of GCS instructions Define C callable functions for GCS instructions used by the kernel. In order to avoid ambitious toolchain requirements for GCS support these are manually encoded, this means we have fixed register numbers which will be a bit limiting for the compiler but none of these should be used in sufficiently fast paths for this to be a problem. Note that GCSSTTR is used to store to EL0. 
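For comparison only (this is not part of the patch and assumes an assembler new enough to accept the mnemonic), the barrier helper could otherwise have been written without a manual encoding:

static inline void gcsb_dsync_mnemonic(void)
{
        /* Same instruction as the ".inst 0xd503227f" encoding used here. */
        asm volatile("gcsb dsync" : : : "memory");
}
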
Reviewed-by: Thiago Jung Bauermann Acked-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-9-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/gcs.h | 51 ++++++++++++++++++++++++++++++++ arch/arm64/include/asm/uaccess.h | 22 ++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 arch/arm64/include/asm/gcs.h diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h new file mode 100644 index 000000000000..7c5e95218db6 --- /dev/null +++ b/arch/arm64/include/asm/gcs.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023 ARM Ltd. + */ +#ifndef __ASM_GCS_H +#define __ASM_GCS_H + +#include +#include + +static inline void gcsb_dsync(void) +{ + asm volatile(".inst 0xd503227f" : : : "memory"); +} + +static inline void gcsstr(u64 *addr, u64 val) +{ + register u64 *_addr __asm__ ("x0") = addr; + register long _val __asm__ ("x1") = val; + + /* GCSSTTR x1, x0 */ + asm volatile( + ".inst 0xd91f1c01\n" + : + : "rZ" (_val), "r" (_addr) + : "memory"); +} + +static inline void gcsss1(u64 Xt) +{ + asm volatile ( + "sys #3, C7, C7, #2, %0\n" + : + : "rZ" (Xt) + : "memory"); +} + +static inline u64 gcsss2(void) +{ + u64 Xt; + + asm volatile( + "SYSL %0, #3, C7, C7, #3\n" + : "=r" (Xt) + : + : "memory"); + + return Xt; +} + +#endif diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 1aa4ecb73429..0db494b24dd0 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -502,4 +502,26 @@ static inline size_t probe_subpage_writeable(const char __user *uaddr, #endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */ +#ifdef CONFIG_ARM64_GCS + +static inline int gcssttr(unsigned long __user *addr, unsigned long val) +{ + register unsigned long __user *_addr __asm__ ("x0") = addr; + register unsigned long _val __asm__ ("x1") = val; + int err = 0; + + /* GCSSTTR x1, x0 */ + asm volatile( + "1: .inst 0xd91f1c01\n" + "2: \n" + _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0) + : "+r" (err) + : "rZ" (_val), "r" (_addr) + : "memory"); + + return err; +} + +#endif /* CONFIG_ARM64_GCS */ + #endif /* __ASM_UACCESS_H */ From d0aa2b4351862cc2ce8d97e00c96bffc02ea16af Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:49 +0100 Subject: [PATCH 010/168] arm64/gcs: Provide put_user_gcs() In order for EL1 to write to an EL0 GCS it must use the GCSSTTR instruction rather than a normal STTR. Provide a put_user_gcs() which does this. 
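As an illustrative sketch of the intended use (the real callers arrive later in the series, so the helper below is an assumption), a cap token could be seeded onto a user GCS with:

static int gcs_put_user_cap(unsigned long __user *addr)
{
        int ret = 0;

        /* Write an architecturally valid cap encoding its own address. */
        put_user_gcs(GCS_CAP(addr), addr, &ret);
        if (!ret)
                gcsb_dsync();

        return ret;
}
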
Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-10-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/uaccess.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 0db494b24dd0..5b91803201ef 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -522,6 +522,24 @@ static inline int gcssttr(unsigned long __user *addr, unsigned long val) return err; } +static inline void put_user_gcs(unsigned long val, unsigned long __user *addr, + int *err) +{ + int ret; + + if (!access_ok((char __user *)addr, sizeof(u64))) { + *err = -EFAULT; + return; + } + + uaccess_ttbr0_enable(); + ret = gcssttr(addr, val); + if (ret != 0) + *err = ret; + uaccess_ttbr0_disable(); +} + + #endif /* CONFIG_ARM64_GCS */ #endif /* __ASM_UACCESS_H */ From ff5181d8a2a82c982276a7e035896185c390e856 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:50 +0100 Subject: [PATCH 011/168] arm64/gcs: Provide basic EL2 setup to allow GCS usage at EL0 and EL1 There is a control HCRX_EL2.GCSEn which must be set to allow GCS features to take effect at lower ELs and also fine grained traps for GCS usage at EL0 and EL1. Configure all these to allow GCS usage by EL0 and EL1. We also initialise GCSCR_EL1 and GCSCRE0_EL1 to ensure that we can execute function call instructions without faulting regardless of the state when the kernel is started. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-11-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/el2_setup.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index e0ffdf13a18b..27086a81eae3 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -27,6 +27,14 @@ ubfx x0, x0, #ID_AA64MMFR1_EL1_HCX_SHIFT, #4 cbz x0, .Lskip_hcrx_\@ mov_q x0, HCRX_HOST_FLAGS + + /* Enable GCS if supported */ + mrs_s x1, SYS_ID_AA64PFR1_EL1 + ubfx x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4 + cbz x1, .Lset_hcrx_\@ + orr x0, x0, #HCRX_EL2_GCSEn + +.Lset_hcrx_\@: msr_s SYS_HCRX_EL2, x0 .Lskip_hcrx_\@: .endm @@ -200,6 +208,16 @@ orr x0, x0, #HFGxTR_EL2_nPOR_EL0 .Lskip_poe_fgt_\@: + /* GCS depends on PIE so we don't check it if PIE is absent */ + mrs_s x1, SYS_ID_AA64PFR1_EL1 + ubfx x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4 + cbz x1, .Lset_fgt_\@ + + /* Disable traps of access to GCS registers at EL0 and EL1 */ + orr x0, x0, #HFGxTR_EL2_nGCS_EL1_MASK + orr x0, x0, #HFGxTR_EL2_nGCS_EL0_MASK + +.Lset_fgt_\@: msr_s SYS_HFGRTR_EL2, x0 msr_s SYS_HFGWTR_EL2, x0 msr_s SYS_HFGITR_EL2, xzr @@ -215,6 +233,17 @@ .Lskip_fgt_\@: .endm +.macro __init_el2_gcs + mrs_s x1, SYS_ID_AA64PFR1_EL1 + ubfx x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4 + cbz x1, .Lskip_gcs_\@ + + /* Ensure GCS is not enabled when we start trying to do BLs */ + msr_s SYS_GCSCR_EL1, xzr + msr_s SYS_GCSCRE0_EL1, xzr +.Lskip_gcs_\@: +.endm + .macro __init_el2_nvhe_prepare_eret mov x0, #INIT_PSTATE_EL1 msr spsr_el2, x0 @@ -240,6 +269,7 @@ __init_el2_nvhe_idregs __init_el2_cptr __init_el2_fgt + __init_el2_gcs .endm #ifndef __KVM_NVHE_HYPERVISOR__ From 6487c963083c24ede289d4267ffa60a9db668cd4 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 
23:58:51 +0100 Subject: [PATCH 012/168] arm64/cpufeature: Runtime detection of Guarded Control Stack (GCS) Add a cpufeature for GCS, allowing other code to conditionally support it at runtime. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-12-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cpufeature.h | 6 ++++++ arch/arm64/kernel/cpufeature.c | 20 ++++++++++++++++++++ arch/arm64/tools/cpucaps | 1 + 3 files changed, 27 insertions(+) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 3d261cc123c1..69470795f5d2 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -838,6 +838,12 @@ static inline bool system_supports_poe(void) alternative_has_cap_unlikely(ARM64_HAS_S1POE); } +static inline bool system_supports_gcs(void) +{ + return IS_ENABLED(CONFIG_ARM64_GCS) && + alternative_has_cap_unlikely(ARM64_HAS_GCS); +} + int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); bool try_emulate_mrs(struct pt_regs *regs, u32 isn); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 718728a85430..d1e758e99e0a 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -291,6 +291,8 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_GCS), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_GCS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SME_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MPAM_frac_SHIFT, 4, 0), @@ -2358,6 +2360,14 @@ static void cpu_enable_poe(const struct arm64_cpu_capabilities *__unused) } #endif +#ifdef CONFIG_ARM64_GCS +static void cpu_enable_gcs(const struct arm64_cpu_capabilities *__unused) +{ + /* GCSPR_EL0 is always readable */ + write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1); +} +#endif + /* Internal helper functions to match cpu capability type */ static bool cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap) @@ -2889,6 +2899,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = cpu_enable_poe, ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1POE, IMP) }, +#endif +#ifdef CONFIG_ARM64_GCS + { + .desc = "Guarded Control Stack (GCS)", + .capability = ARM64_HAS_GCS, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .cpu_enable = cpu_enable_gcs, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, GCS, IMP) + }, #endif {}, }; diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index eedb5acc21ed..867d25d4a45a 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -29,6 +29,7 @@ HAS_EVT HAS_FPMR HAS_FGT HAS_FPSIMD +HAS_GCS HAS_GENERIC_AUTH HAS_GENERIC_AUTH_ARCH_QARMA3 HAS_GENERIC_AUTH_ARCH_QARMA5 From 092055f1508cce6f60d4927fe8a048d76bbad73e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:52 +0100 Subject: [PATCH 013/168] arm64/mm: Allocate PIE slots for EL0 guarded control stack Pages used for guarded control stacks need to be described to the hardware using the Permission Indirection Extension, GCS is not supported without PIE. In order to support copy on write for guarded stacks we allocate two values, one for active GCSs and one for GCS pages marked as read only prior to copy. 
Since the actual effect is defined using PIE the specific bit pattern used does not matter to the hardware but we choose two values which differ only in PTE_WRITE in order to help share code with non-PIE cases. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-13-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable-prot.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 2a11d0c10760..4e4bcd676f4c 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -144,15 +144,23 @@ static inline bool __pure lpa2_is_enabled(void) /* 6: PTE_PXN | PTE_WRITE */ /* 7: PAGE_SHARED_EXEC PTE_PXN | PTE_WRITE | PTE_USER */ /* 8: PAGE_KERNEL_ROX PTE_UXN */ -/* 9: PTE_UXN | PTE_USER */ +/* 9: PAGE_GCS_RO PTE_UXN | PTE_USER */ /* a: PAGE_KERNEL_EXEC PTE_UXN | PTE_WRITE */ -/* b: PTE_UXN | PTE_WRITE | PTE_USER */ +/* b: PAGE_GCS PTE_UXN | PTE_WRITE | PTE_USER */ /* c: PAGE_KERNEL_RO PTE_UXN | PTE_PXN */ /* d: PAGE_READONLY PTE_UXN | PTE_PXN | PTE_USER */ /* e: PAGE_KERNEL PTE_UXN | PTE_PXN | PTE_WRITE */ /* f: PAGE_SHARED PTE_UXN | PTE_PXN | PTE_WRITE | PTE_USER */ +#define _PAGE_GCS (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_WRITE | PTE_USER) +#define _PAGE_GCS_RO (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_USER) + +#define PAGE_GCS __pgprot(_PAGE_GCS) +#define PAGE_GCS_RO __pgprot(_PAGE_GCS_RO) + #define PIE_E0 ( \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS), PIE_GCS) | \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS_RO), PIE_R) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_X_O) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX_O) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX_O) | \ @@ -160,6 +168,8 @@ static inline bool __pure lpa2_is_enabled(void) PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED), PIE_RW_O)) #define PIE_E1 ( \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS), PIE_NONE_O) | \ + PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS_RO), PIE_NONE_O) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_NONE_O) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_R) | \ PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RW) | \ From ae80e1629aeaf5be726a9ea94eb7345b1a44b00d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:53 +0100 Subject: [PATCH 014/168] mm: Define VM_SHADOW_STACK for arm64 when we support GCS Use VM_HIGH_ARCH_5 for guarded control stack pages. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-14-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- Documentation/filesystems/proc.rst | 2 +- include/linux/mm.h | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index e834779d9611..6a882c57a7e7 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -579,7 +579,7 @@ encoded manner. 
The codes are the following: mt arm64 MTE allocation tags are enabled um userfaultfd missing tracking uw userfaultfd wr-protect tracking - ss shadow stack page + ss shadow/guarded control stack page sl sealed == ======================================= diff --git a/include/linux/mm.h b/include/linux/mm.h index 56654306a832..8852c39c7695 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -367,7 +367,17 @@ extern unsigned int kobjsize(const void *objp); * for more details on the guard size. */ # define VM_SHADOW_STACK VM_HIGH_ARCH_5 -#else +#endif + +#if defined(CONFIG_ARM64_GCS) +/* + * arm64's Guarded Control Stack implements similar functionality and + * has similar constraints to shadow stacks. + */ +# define VM_SHADOW_STACK VM_HIGH_ARCH_6 +#endif + +#ifndef VM_SHADOW_STACK # define VM_SHADOW_STACK VM_NONE #endif From 6497b66ba6945f142902c7e8fce86e47016ead1c Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:54 +0100 Subject: [PATCH 015/168] arm64/mm: Map pages for guarded control stack Map pages flagged as being part of a GCS as such rather than using the full set of generic VM flags. This is done using a conditional rather than extending the size of protection_map since that would make for a very sparse array. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-15-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mman.h | 9 +++++++++ arch/arm64/mm/mmap.c | 9 ++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h index 03b790fd0ad8..f6d784f8e6e0 100644 --- a/arch/arm64/include/asm/mman.h +++ b/arch/arm64/include/asm/mman.h @@ -71,6 +71,15 @@ static inline bool arch_validate_flags(unsigned long vm_flags) return false; } + if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) { + /* An executable GCS isn't a good idea. */ + if (vm_flags & VM_EXEC) + return false; + + /* The memory management core should prevent this */ + VM_WARN_ON(vm_flags & VM_SHARED); + } + return true; } diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 7e3ad97e27d8..07aeab8a7606 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -83,8 +83,15 @@ arch_initcall(adjust_protection_map); pgprot_t vm_get_page_prot(unsigned long vm_flags) { - pteval_t prot = pgprot_val(protection_map[vm_flags & + pteval_t prot; + + /* Short circuit GCS to avoid bloating the table. */ + if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) { + prot = _PAGE_GCS_RO; + } else { + prot = pgprot_val(protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]); + } if (vm_flags & VM_ARM64_BTI) prot |= PTE_GP; From a94452112ce4020af073af368d7c25a07bfabf37 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:56 +0100 Subject: [PATCH 016/168] arm64/idreg: Add overrride for GCS Hook up an override for GCS, allowing it to be disabled from the command line by specifying arm64.nogcs in case there are problems. 
Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Acked-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-17-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- Documentation/admin-guide/kernel-parameters.txt | 3 +++ arch/arm64/kernel/pi/idreg-override.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1518343bbe22..c1b00f709734 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -446,6 +446,9 @@ arm64.nobti [ARM64] Unconditionally disable Branch Target Identification support + arm64.nogcs [ARM64] Unconditionally disable Guarded Control Stack + support + arm64.nomops [ARM64] Unconditionally disable Memory Copy and Memory Set instructions support diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index 29d4b6244a6f..2bb709d78405 100644 --- a/arch/arm64/kernel/pi/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -133,6 +133,7 @@ static const struct ftr_set_desc pfr1 __prel64_initconst = { .override = &id_aa64pfr1_override, .fields = { FIELD("bt", ID_AA64PFR1_EL1_BT_SHIFT, NULL ), + FIELD("gcs", ID_AA64PFR1_EL1_GCS_SHIFT, NULL), FIELD("mte", ID_AA64PFR1_EL1_MTE_SHIFT, NULL), FIELD("sme", ID_AA64PFR1_EL1_SME_SHIFT, pfr1_sme_filter), {} @@ -215,6 +216,7 @@ static const struct { { "arm64.nosve", "id_aa64pfr0.sve=0" }, { "arm64.nosme", "id_aa64pfr1.sme=0" }, { "arm64.nobti", "id_aa64pfr1.bt=0" }, + { "arm64.nogcs", "id_aa64pfr1.gcs=0" }, { "arm64.nopauth", "id_aa64isar1.gpi=0 id_aa64isar1.gpa=0 " "id_aa64isar1.api=0 id_aa64isar1.apa=0 " From eefc98711f84abd7ffb074af0cdbc5f8a8464272 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:57 +0100 Subject: [PATCH 017/168] arm64/hwcap: Add hwcap for GCS Provide a hwcap to enable userspace to detect support for GCS. Signed-off-by: Mark Brown Acked-by: Yury Khrustalev Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-18-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/elf_hwcaps.rst | 4 ++++ arch/arm64/include/asm/hwcap.h | 1 + arch/arm64/include/uapi/asm/hwcap.h | 3 ++- arch/arm64/kernel/cpufeature.c | 3 +++ arch/arm64/kernel/cpuinfo.c | 1 + 5 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Documentation/arch/arm64/elf_hwcaps.rst b/Documentation/arch/arm64/elf_hwcaps.rst index 694f67fa07d1..25b41ff74fa0 100644 --- a/Documentation/arch/arm64/elf_hwcaps.rst +++ b/Documentation/arch/arm64/elf_hwcaps.rst @@ -170,6 +170,10 @@ HWCAP_PACG ID_AA64ISAR1_EL1.GPI == 0b0001, as described by Documentation/arch/arm64/pointer-authentication.rst. +HWCAP_GCS + Functionality implied by ID_AA64PFR1_EL1.GCS == 0b1, as + described by Documentation/arch/arm64/gcs.rst. + HWCAP2_DCPODP Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0010. 
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index a775adddecf2..7bcf1347ca0b 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -92,6 +92,7 @@ #define KERNEL_HWCAP_SB __khwcap_feature(SB) #define KERNEL_HWCAP_PACA __khwcap_feature(PACA) #define KERNEL_HWCAP_PACG __khwcap_feature(PACG) +#define KERNEL_HWCAP_GCS __khwcap_feature(GCS) #define __khwcap2_feature(x) (const_ilog2(HWCAP2_ ## x) + 64) #define KERNEL_HWCAP_DCPODP __khwcap2_feature(DCPODP) diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 055381b2c615..675642ec4d91 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -21,7 +21,7 @@ * HWCAP flags - for AT_HWCAP * * Bits 62 and 63 are reserved for use by libc. - * Bits 32-61 are unallocated for potential use by libc. + * Bits 33-61 are unallocated for potential use by libc. */ #define HWCAP_FP (1 << 0) #define HWCAP_ASIMD (1 << 1) @@ -55,6 +55,7 @@ #define HWCAP_SB (1 << 29) #define HWCAP_PACA (1 << 30) #define HWCAP_PACG (1UL << 31) +#define HWCAP_GCS (1UL << 32) /* * HWCAP2 flags - for AT_HWCAP2 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index d1e758e99e0a..b8655d55f318 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -3025,6 +3025,9 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(ID_AA64ZFR0_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEI8MM), HWCAP_CAP(ID_AA64ZFR0_EL1, F32MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF32MM), HWCAP_CAP(ID_AA64ZFR0_EL1, F64MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM), +#endif +#ifdef CONFIG_ARM64_GCS + HWCAP_CAP(ID_AA64PFR1_EL1, GCS, IMP, CAP_HWCAP, KERNEL_HWCAP_GCS), #endif HWCAP_CAP(ID_AA64PFR1_EL1, SSBS, SSBS2, CAP_HWCAP, KERNEL_HWCAP_SSBS), #ifdef CONFIG_ARM64_BTI diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 44718d0482b3..f2f92c6b1c85 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -80,6 +80,7 @@ static const char *const hwcap_str[] = { [KERNEL_HWCAP_SB] = "sb", [KERNEL_HWCAP_PACA] = "paca", [KERNEL_HWCAP_PACG] = "pacg", + [KERNEL_HWCAP_GCS] = "gcs", [KERNEL_HWCAP_DCPODP] = "dcpodp", [KERNEL_HWCAP_SVE2] = "sve2", [KERNEL_HWCAP_SVEAES] = "sveaes", From 8ce71d270536dd7a48698a2b18ddf13f2d5007fb Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:58 +0100 Subject: [PATCH 018/168] arm64/traps: Handle GCS exceptions A new exception code is defined for GCS specific faults other than standard load/store faults, for example GCS token validation failures, add handling for this. These faults are reported to userspace as segfaults with code SEGV_CPERR (protection error), mirroring the reporting for x86 shadow stack errors. GCS faults due to memory load/store operations generate data aborts with a flag set, these will be handled separately as part of the data abort handling. Since we do not currently enable GCS for EL1 we should not get any faults there but while we're at it we wire things up there, treating any GCS fault as fatal. 
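To illustrate what userspace observes (this example is not part of the patch; the fallback define mirrors the UAPI value for libcs that do not yet expose it), such faults can be distinguished from other segfaults via si_code:

#include <signal.h>
#include <unistd.h>

#ifndef SEGV_CPERR
#define SEGV_CPERR      10
#endif

static void segv_handler(int sig, siginfo_t *info, void *ucontext)
{
        if (info->si_code == SEGV_CPERR) {
                static const char msg[] = "GCS protection error\n";

                write(STDERR_FILENO, msg, sizeof(msg) - 1);
        }
        _exit(1);
}

static void install_segv_handler(void)
{
        struct sigaction sa = { .sa_sigaction = segv_handler,
                                .sa_flags = SA_SIGINFO };

        sigaction(SIGSEGV, &sa, NULL);
}
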
Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-19-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/esr.h | 28 +++++++++++++++++++++++++++- arch/arm64/include/asm/exception.h | 2 ++ arch/arm64/kernel/entry-common.c | 23 +++++++++++++++++++++++ arch/arm64/kernel/traps.c | 11 +++++++++++ 4 files changed, 63 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index da6d2c1c0b03..d1b1a33f9a8b 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -51,7 +51,8 @@ #define ESR_ELx_EC_FP_EXC32 UL(0x28) /* Unallocated EC: 0x29 - 0x2B */ #define ESR_ELx_EC_FP_EXC64 UL(0x2C) -/* Unallocated EC: 0x2D - 0x2E */ +#define ESR_ELx_EC_GCS UL(0x2D) +/* Unallocated EC: 0x2E */ #define ESR_ELx_EC_SERROR UL(0x2F) #define ESR_ELx_EC_BREAKPT_LOW UL(0x30) #define ESR_ELx_EC_BREAKPT_CUR UL(0x31) @@ -386,6 +387,31 @@ #define ESR_ELx_MOPS_ISS_SRCREG(esr) (((esr) & (UL(0x1f) << 5)) >> 5) #define ESR_ELx_MOPS_ISS_SIZEREG(esr) (((esr) & (UL(0x1f) << 0)) >> 0) +/* ISS field definitions for GCS */ +#define ESR_ELx_ExType_SHIFT (20) +#define ESR_ELx_ExType_MASK GENMASK(23, 20) +#define ESR_ELx_Raddr_SHIFT (10) +#define ESR_ELx_Raddr_MASK GENMASK(14, 10) +#define ESR_ELx_Rn_SHIFT (5) +#define ESR_ELx_Rn_MASK GENMASK(9, 5) +#define ESR_ELx_Rvalue_SHIFT 5 +#define ESR_ELx_Rvalue_MASK GENMASK(9, 5) +#define ESR_ELx_IT_SHIFT (0) +#define ESR_ELx_IT_MASK GENMASK(4, 0) + +#define ESR_ELx_ExType_DATA_CHECK 0 +#define ESR_ELx_ExType_EXLOCK 1 +#define ESR_ELx_ExType_STR 2 + +#define ESR_ELx_IT_RET 0 +#define ESR_ELx_IT_GCSPOPM 1 +#define ESR_ELx_IT_RET_KEYA 2 +#define ESR_ELx_IT_RET_KEYB 3 +#define ESR_ELx_IT_GCSSS1 4 +#define ESR_ELx_IT_GCSSS2 5 +#define ESR_ELx_IT_GCSPOPCX 6 +#define ESR_ELx_IT_GCSPOPX 7 + #ifndef __ASSEMBLY__ #include diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index f296662590c7..674518464718 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -57,6 +57,8 @@ void do_el0_undef(struct pt_regs *regs, unsigned long esr); void do_el1_undef(struct pt_regs *regs, unsigned long esr); void do_el0_bti(struct pt_regs *regs); void do_el1_bti(struct pt_regs *regs, unsigned long esr); +void do_el0_gcs(struct pt_regs *regs, unsigned long esr); +void do_el1_gcs(struct pt_regs *regs, unsigned long esr); void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr, struct pt_regs *regs); void do_fpsimd_acc(unsigned long esr, struct pt_regs *regs); diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 3fcd9d080bf2..fe74813009bd 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -463,6 +463,15 @@ static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) exit_to_kernel_mode(regs); } +static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr) +{ + enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_el1_gcs(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs); +} + static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); @@ -505,6 +514,9 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_BTI: el1_bti(regs, esr); break; + case ESR_ELx_EC_GCS: + el1_gcs(regs, esr); + break; case ESR_ELx_EC_BREAKPT_CUR: case ESR_ELx_EC_SOFTSTP_CUR: case 
ESR_ELx_EC_WATCHPT_CUR: @@ -684,6 +696,14 @@ static void noinstr el0_mops(struct pt_regs *regs, unsigned long esr) exit_to_user_mode(regs); } +static void noinstr el0_gcs(struct pt_regs *regs, unsigned long esr) +{ + enter_from_user_mode(regs); + local_daif_restore(DAIF_PROCCTX); + do_el0_gcs(regs, esr); + exit_to_user_mode(regs); +} + static void noinstr el0_inv(struct pt_regs *regs, unsigned long esr) { enter_from_user_mode(regs); @@ -766,6 +786,9 @@ asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_MOPS: el0_mops(regs, esr); break; + case ESR_ELx_EC_GCS: + el0_gcs(regs, esr); + break; case ESR_ELx_EC_BREAKPT_LOW: case ESR_ELx_EC_SOFTSTP_LOW: case ESR_ELx_EC_WATCHPT_LOW: diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 563cbce11126..fdbcf047108c 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -506,6 +506,16 @@ void do_el1_bti(struct pt_regs *regs, unsigned long esr) die("Oops - BTI", regs, esr); } +void do_el0_gcs(struct pt_regs *regs, unsigned long esr) +{ + force_signal_inject(SIGSEGV, SEGV_CPERR, regs->pc, 0); +} + +void do_el1_gcs(struct pt_regs *regs, unsigned long esr) +{ + die("Oops - GCS", regs, esr); +} + void do_el0_fpac(struct pt_regs *regs, unsigned long esr) { force_signal_inject(SIGILL, ILL_ILLOPN, regs->pc, esr); @@ -852,6 +862,7 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_MOPS] = "MOPS", [ESR_ELx_EC_FP_EXC32] = "FP (AArch32)", [ESR_ELx_EC_FP_EXC64] = "FP (AArch64)", + [ESR_ELx_EC_GCS] = "Guarded Control Stack", [ESR_ELx_EC_SERROR] = "SError", [ESR_ELx_EC_BREAKPT_LOW] = "Breakpoint (lower EL)", [ESR_ELx_EC_BREAKPT_CUR] = "Breakpoint (current EL)", From cfad706e8f6ded5cec69f820bceeca9e64394592 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:58:59 +0100 Subject: [PATCH 019/168] arm64/mm: Handle GCS data aborts All GCS operations at EL0 must happen on a page which is marked as having UnprivGCS access, including read operations. If a GCS operation attempts to access a page without this then it will generate a data abort with the GCS bit set in ESR_EL1.ISS2. EL0 may validly generate such faults, for example due to copy on write which will cause the GCS data to be stored in a read only page with no GCS permissions until the actual copy happens. Since UnprivGCS allows both reads and writes to the GCS (though only through GCS operations) we need to ensure that the memory management subsystem handles GCS accesses as writes at all times. Do this by adding FAULT_FLAG_WRITE to any GCS page faults, adding handling to ensure that invalid cases are identfied as such early so the memory management core does not think they will succeed. The core cannot distinguish between VMAs which are generally writeable and VMAs which are only writeable through GCS operations. EL1 may validly write to EL0 GCS for management purposes (eg, while initialising with cap tokens). We also report any GCS faults in VMAs not marked as part of a GCS as access violations, causing a fault to be delivered to userspace if it attempts to do GCS operations outside a GCS. 
Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-20-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/mm/fault.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 8b281cf308b3..c2f89a678ac0 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -504,6 +504,14 @@ static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma, false); } +static bool is_gcs_fault(unsigned long esr) +{ + if (!esr_is_data_abort(esr)) + return false; + + return ESR_ELx_ISS2(esr) & ESR_ELx_GCS; +} + static bool is_el0_instruction_abort(unsigned long esr) { return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW; @@ -518,6 +526,23 @@ static bool is_write_abort(unsigned long esr) return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM); } +static bool is_invalid_gcs_access(struct vm_area_struct *vma, u64 esr) +{ + if (!system_supports_gcs()) + return false; + + if (unlikely(is_gcs_fault(esr))) { + /* GCS accesses must be performed on a GCS page */ + if (!(vma->vm_flags & VM_SHADOW_STACK)) + return true; + } else if (unlikely(vma->vm_flags & VM_SHADOW_STACK)) { + /* Only GCS operations can write to a GCS page */ + return esr_is_data_abort(esr) && is_write_abort(esr); + } + + return false; +} + static int __kprobes do_page_fault(unsigned long far, unsigned long esr, struct pt_regs *regs) { @@ -554,6 +579,14 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, /* It was exec fault */ vm_flags = VM_EXEC; mm_flags |= FAULT_FLAG_INSTRUCTION; + } else if (is_gcs_fault(esr)) { + /* + * The GCS permission on a page implies both read and + * write so always handle any GCS fault as a write fault, + * we need to trigger CoW even for GCS reads. + */ + vm_flags = VM_WRITE; + mm_flags |= FAULT_FLAG_WRITE; } else if (is_write_abort(esr)) { /* It was write fault */ vm_flags = VM_WRITE; @@ -587,6 +620,13 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, if (!vma) goto lock_mmap; + if (is_invalid_gcs_access(vma, esr)) { + vma_end_read(vma); + fault = 0; + si_code = SEGV_ACCERR; + goto bad_area; + } + if (!(vma->vm_flags & vm_flags)) { vma_end_read(vma); fault = 0; From fc84bc5378a8c852308fa022957b8976adb5aa6a Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:00 +0100 Subject: [PATCH 020/168] arm64/gcs: Context switch GCS state for EL0 There are two registers controlling the GCS state of EL0, GCSPR_EL0 which is the current GCS pointer and GCSCRE0_EL1 which has enable bits for the specific GCS functionality enabled for EL0. Manage these on context switch and process lifetime events, GCS is reset on exec(). Also ensure that any changes to the GCS memory are visible to other PEs and that changes from other PEs are visible on this one by issuing a GCSB DSYNC when moving to or from a thread with GCS. Since the current GCS configuration of a thread will be visible to userspace we store the configuration in the format used with userspace and provide a helper which configures the system register as needed. On systems that support GCS we always allow access to GCSPR_EL0, this facilitates reporting of GCS faults if userspace implements disabling of GCS on error - the GCS can still be discovered and examined even if GCS has been disabled. 
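For example (illustrative only, not part of this patch), userspace can sample the current GCS pointer directly; the explicit register encoding is used here on the assumption that the toolchain does not yet know the GCS register names:

static inline unsigned long get_gcspr_el0(void)
{
        unsigned long val;

        /* GCSPR_EL0 is encoded as S3_3_C2_C5_1 */
        asm volatile("mrs %0, S3_3_C2_C5_1" : "=r" (val));

        return val;
}
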
Reviewed-by: Catalin Marinas Reviewed-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-21-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/gcs.h | 24 ++++++++++++ arch/arm64/include/asm/processor.h | 6 +++ arch/arm64/kernel/process.c | 62 ++++++++++++++++++++++++++++++ arch/arm64/mm/Makefile | 1 + arch/arm64/mm/gcs.c | 42 ++++++++++++++++++++ 5 files changed, 135 insertions(+) create mode 100644 arch/arm64/mm/gcs.c diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h index 7c5e95218db6..04594ef59dad 100644 --- a/arch/arm64/include/asm/gcs.h +++ b/arch/arm64/include/asm/gcs.h @@ -48,4 +48,28 @@ static inline u64 gcsss2(void) return Xt; } +#ifdef CONFIG_ARM64_GCS + +static inline bool task_gcs_el0_enabled(struct task_struct *task) +{ + return current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE; +} + +void gcs_set_el0_mode(struct task_struct *task); +void gcs_free(struct task_struct *task); +void gcs_preserve_current_state(void); + +#else + +static inline bool task_gcs_el0_enabled(struct task_struct *task) +{ + return false; +} + +static inline void gcs_set_el0_mode(struct task_struct *task) { } +static inline void gcs_free(struct task_struct *task) { } +static inline void gcs_preserve_current_state(void) { } + +#endif + #endif diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 1438424f0064..5260788247d8 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -185,6 +185,12 @@ struct thread_struct { u64 svcr; u64 tpidr2_el0; u64 por_el0; +#ifdef CONFIG_ARM64_GCS + unsigned int gcs_el0_mode; + u64 gcspr_el0; + u64 gcs_base; + u64 gcs_size; +#endif }; static inline unsigned int thread_get_vl(struct thread_struct *thread, diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 0540653fbf38..aedcf332f422 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -280,6 +281,25 @@ static void flush_poe(void) write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); } +#ifdef CONFIG_ARM64_GCS + +static void flush_gcs(void) +{ + if (!system_supports_gcs()) + return; + + gcs_free(current); + current->thread.gcs_el0_mode = 0; + write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1); + write_sysreg_s(0, SYS_GCSPR_EL0); +} + +#else + +static void flush_gcs(void) { } + +#endif + void flush_thread(void) { fpsimd_flush_thread(); @@ -287,6 +307,7 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(current); flush_tagged_addr_state(); flush_poe(); + flush_gcs(); } void arch_release_task_struct(struct task_struct *tsk) @@ -484,6 +505,46 @@ static void entry_task_switch(struct task_struct *next) __this_cpu_write(__entry_task, next); } +#ifdef CONFIG_ARM64_GCS + +void gcs_preserve_current_state(void) +{ + current->thread.gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); +} + +static void gcs_thread_switch(struct task_struct *next) +{ + if (!system_supports_gcs()) + return; + + /* GCSPR_EL0 is always readable */ + gcs_preserve_current_state(); + write_sysreg_s(next->thread.gcspr_el0, SYS_GCSPR_EL0); + + if (current->thread.gcs_el0_mode != next->thread.gcs_el0_mode) + gcs_set_el0_mode(next); + + /* + * Ensure that GCS memory effects of the 'prev' thread are + * ordered before other memory accesses with release semantics + * (or preceded by a DMB) on the current PE. 
In addition, any + * memory accesses with acquire semantics (or succeeded by a + * DMB) are ordered before GCS memory effects of the 'next' + * thread. This will ensure that the GCS memory effects are + * visible to other PEs in case of migration. + */ + if (task_gcs_el0_enabled(current) || task_gcs_el0_enabled(next)) + gcsb_dsync(); +} + +#else + +static void gcs_thread_switch(struct task_struct *next) +{ +} + +#endif + /* * Handle sysreg updates for ARM erratum 1418040 which affects the 32bit view of * CNTVCT, various other errata which require trapping all CNTVCT{,_EL0} @@ -580,6 +641,7 @@ struct task_struct *__switch_to(struct task_struct *prev, cntkctl_thread_switch(prev, next); ptrauth_thread_switch_user(next); permission_overlay_switch(next); + gcs_thread_switch(next); /* * Complete any pending TLB or cache maintenance on this CPU in case diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index 2fc8c6dd0407..fc92170a8f37 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_TRANS_TABLE) += trans_pgd.o obj-$(CONFIG_TRANS_TABLE) += trans_pgd-asm.o obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o obj-$(CONFIG_ARM64_MTE) += mteswap.o +obj-$(CONFIG_ARM64_GCS) += gcs.o KASAN_SANITIZE_physaddr.o += n obj-$(CONFIG_KASAN) += kasan_init.o diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c new file mode 100644 index 000000000000..f8f4f984a247 --- /dev/null +++ b/arch/arm64/mm/gcs.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include + +#include +#include + +/* + * Apply the GCS mode configured for the specified task to the + * hardware. + */ +void gcs_set_el0_mode(struct task_struct *task) +{ + u64 gcscre0_el1 = GCSCRE0_EL1_nTR; + + if (task->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE) + gcscre0_el1 |= GCSCRE0_EL1_RVCHKEN | GCSCRE0_EL1_PCRSEL; + + if (task->thread.gcs_el0_mode & PR_SHADOW_STACK_WRITE) + gcscre0_el1 |= GCSCRE0_EL1_STREn; + + if (task->thread.gcs_el0_mode & PR_SHADOW_STACK_PUSH) + gcscre0_el1 |= GCSCRE0_EL1_PUSHMEn; + + write_sysreg_s(gcscre0_el1, SYS_GCSCRE0_EL1); +} + +void gcs_free(struct task_struct *task) +{ + if (!system_supports_gcs()) + return; + + if (task->thread.gcs_base) + vm_munmap(task->thread.gcs_base, task->thread.gcs_size); + + task->thread.gcspr_el0 = 0; + task->thread.gcs_base = 0; + task->thread.gcs_size = 0; +} From 506496bcbb4204c9ff5cfe82b1b90e1f14366992 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:01 +0100 Subject: [PATCH 021/168] arm64/gcs: Ensure that new threads have a GCS When a new thread is created by a thread with GCS enabled the GCS needs to be specified along with the regular stack. Unfortunately plain clone() is not extensible and existing clone3() users will not specify a stack so all existing code would be broken if we mandated specifying the stack explicitly. For compatibility with these cases and also x86 (which did not initially implement clone3() support for shadow stacks) if no GCS is specified we will allocate one so when a thread is created which has GCS enabled allocate one for it. We follow the extensively discussed x86 implementation and allocate min(RLIMIT_STACK/2, 2G). Since the GCS only stores the call stack and not any variables this should be more than sufficient for most applications. GCSs allocated via this mechanism will be freed when the thread exits. 
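To make the default sizing concrete, the following is a userspace mirror of the policy described above, PAGE_ALIGN(min(RLIMIT_STACK / 2, 2G)) with a minimum of one page. It is purely illustrative and is not a kernel interface:

#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

#define SZ_2G	(2ULL << 30)

static unsigned long long default_gcs_size(void)
{
	unsigned long long page = sysconf(_SC_PAGESIZE);
	unsigned long long size;
	struct rlimit rl;

	if (getrlimit(RLIMIT_STACK, &rl) != 0)
		return 0;

	/* min(RLIMIT_STACK / 2, 2G); RLIM_INFINITY clamps to 2G */
	size = rl.rlim_cur / 2;
	if (size > SZ_2G)
		size = SZ_2G;

	/* PAGE_ALIGN(), with PAGE_SIZE as the lower bound */
	size = (size + page - 1) & ~(page - 1);

	return size < page ? page : size;
}

int main(void)
{
	printf("default thread GCS size: %llu bytes\n", default_gcs_size());
	return 0;
}
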
Reviewed-by: Thiago Jung Bauermann Acked-by: Yury Khrustalev Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-22-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/gcs.h | 9 ++++ arch/arm64/include/asm/mmu_context.h | 9 ++++ arch/arm64/kernel/process.c | 32 +++++++++++++ arch/arm64/mm/gcs.c | 69 ++++++++++++++++++++++++++++ 4 files changed, 119 insertions(+) diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h index 04594ef59dad..c1f274fdb9c0 100644 --- a/arch/arm64/include/asm/gcs.h +++ b/arch/arm64/include/asm/gcs.h @@ -8,6 +8,8 @@ #include #include +struct kernel_clone_args; + static inline void gcsb_dsync(void) { asm volatile(".inst 0xd503227f" : : : "memory"); @@ -58,6 +60,8 @@ static inline bool task_gcs_el0_enabled(struct task_struct *task) void gcs_set_el0_mode(struct task_struct *task); void gcs_free(struct task_struct *task); void gcs_preserve_current_state(void); +unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, + const struct kernel_clone_args *args); #else @@ -69,6 +73,11 @@ static inline bool task_gcs_el0_enabled(struct task_struct *task) static inline void gcs_set_el0_mode(struct task_struct *task) { } static inline void gcs_free(struct task_struct *task) { } static inline void gcs_preserve_current_state(void) { } +static inline unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, + const struct kernel_clone_args *args) +{ + return -ENOTSUPP; +} #endif diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 7c09d47e09cb..48b3d9553b67 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -311,6 +312,14 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, return por_el0_allows_pkey(vma_pkey(vma), write, execute); } +#define deactivate_mm deactivate_mm +static inline void deactivate_mm(struct task_struct *tsk, + struct mm_struct *mm) +{ + gcs_free(tsk); +} + + #include #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index aedcf332f422..fdd095480c3f 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -294,9 +294,35 @@ static void flush_gcs(void) write_sysreg_s(0, SYS_GCSPR_EL0); } +static int copy_thread_gcs(struct task_struct *p, + const struct kernel_clone_args *args) +{ + unsigned long gcs; + + if (!system_supports_gcs()) + return 0; + + p->thread.gcs_base = 0; + p->thread.gcs_size = 0; + + gcs = gcs_alloc_thread_stack(p, args); + if (IS_ERR_VALUE(gcs)) + return PTR_ERR((void *)gcs); + + p->thread.gcs_el0_mode = current->thread.gcs_el0_mode; + p->thread.gcs_el0_locked = current->thread.gcs_el0_locked; + + return 0; +} + #else static void flush_gcs(void) { } +static int copy_thread_gcs(struct task_struct *p, + const struct kernel_clone_args *args) +{ + return 0; +} #endif @@ -313,6 +339,7 @@ void flush_thread(void) void arch_release_task_struct(struct task_struct *tsk) { fpsimd_release_task(tsk); + gcs_free(tsk); } int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) @@ -376,6 +403,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) unsigned long stack_start = args->stack; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); + int ret; memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context)); @@ -420,6 +448,10 @@ int copy_thread(struct 
task_struct *p, const struct kernel_clone_args *args) p->thread.uw.tp_value = tls; p->thread.tpidr2_el0 = 0; } + + ret = copy_thread_gcs(p, args); + if (ret != 0) + return ret; } else { /* * A kthread has no context to ERET to, so ensure any buggy diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c index f8f4f984a247..3c7a18f57ea9 100644 --- a/arch/arm64/mm/gcs.c +++ b/arch/arm64/mm/gcs.c @@ -5,9 +5,69 @@ #include #include +#include #include +#include #include +static unsigned long alloc_gcs(unsigned long addr, unsigned long size) +{ + int flags = MAP_ANONYMOUS | MAP_PRIVATE; + struct mm_struct *mm = current->mm; + unsigned long mapped_addr, unused; + + if (addr) + flags |= MAP_FIXED_NOREPLACE; + + mmap_write_lock(mm); + mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags, + VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL); + mmap_write_unlock(mm); + + return mapped_addr; +} + +static unsigned long gcs_size(unsigned long size) +{ + if (size) + return PAGE_ALIGN(size); + + /* Allocate RLIMIT_STACK/2 with limits of PAGE_SIZE..2G */ + size = PAGE_ALIGN(min_t(unsigned long long, + rlimit(RLIMIT_STACK) / 2, SZ_2G)); + return max(PAGE_SIZE, size); +} + +unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, + const struct kernel_clone_args *args) +{ + unsigned long addr, size; + + if (!system_supports_gcs()) + return 0; + + if (!task_gcs_el0_enabled(tsk)) + return 0; + + if ((args->flags & (CLONE_VFORK | CLONE_VM)) != CLONE_VM) { + tsk->thread.gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); + return 0; + } + + size = args->stack_size / 2; + + size = gcs_size(size); + addr = alloc_gcs(0, size); + if (IS_ERR_VALUE(addr)) + return addr; + + tsk->thread.gcs_base = addr; + tsk->thread.gcs_size = size; + tsk->thread.gcspr_el0 = addr + size - sizeof(u64); + + return addr; +} + /* * Apply the GCS mode configured for the specified task to the * hardware. @@ -33,6 +93,15 @@ void gcs_free(struct task_struct *task) if (!system_supports_gcs()) return; + /* + * When fork() with CLONE_VM fails, the child (tsk) already + * has a GCS allocated, and exit_thread() calls this function + * to free it. In this case the parent (current) and the + * child share the same mm struct. + */ + if (!task->mm || task->mm != current->mm) + return; + if (task->thread.gcs_base) vm_munmap(task->thread.gcs_base, task->thread.gcs_size); From b57180c75c7ebff6613886cb69ef6e283a10358b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:02 +0100 Subject: [PATCH 022/168] arm64/gcs: Implement shadow stack prctl() interface Implement the architecture neutral prctl() interface for setting the shadow stack status, this supports setting and reading the current GCS configuration for the current thread. Userspace can enable basic GCS functionality and additionally also support for GCS pushes and arbitrary GCS stores. It is expected that this prctl() will be called very early in application startup, for example by the dynamic linker, and not subsequently adjusted during normal operation. Users should carefully note that after enabling GCS for a thread GCS will become active with no call stack so it is not normally possible to return from the function that invoked the prctl(). State is stored per thread, enabling GCS for a thread causes a GCS to be allocated for that thread. Userspace may lock the current GCS configuration by specifying PR_SHADOW_STACK_ENABLE_LOCK, this prevents any further changes to the GCS configuration via any means. 
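As an illustration of the expected early-startup usage (a sketch, not part of this patch; the prctl() request values are those from the arch-agnostic interface added earlier in this series and may already be provided by newer libc headers):

#include <stdlib.h>
#include <sys/prctl.h>

#ifndef PR_SET_SHADOW_STACK_STATUS
#define PR_GET_SHADOW_STACK_STATUS	74
#define PR_SET_SHADOW_STACK_STATUS	75
#define PR_LOCK_SHADOW_STACK_STATUS	76
#define PR_SHADOW_STACK_ENABLE		(1UL << 0)
#endif

int main(void)
{
	unsigned long mode;

	/* The prctl() fails on kernels or hardware without GCS */
	if (prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0) != 0)
		return 0;	/* no GCS support, nothing to do */

	if (!(mode & PR_SHADOW_STACK_ENABLE) &&
	    prctl(PR_SET_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE,
		  0, 0, 0) != 0)
		return 1;

	/* Optionally prevent any further changes for this thread */
	prctl(PR_LOCK_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE, 0, 0, 0);

	/* ... application work ... */

	/*
	 * main() was entered without a GCS so we must not return from
	 * it now that GCS is enabled, exit() instead.
	 */
	exit(0);
}

Note that the sketch calls exit() rather than returning from the function that enabled GCS, for the reason described above.
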
If GCS is not being enabled then all flags other than _LOCK are ignored, it is not possible to enable stores or pops without enabling GCS. When disabling the GCS we do not free the allocated stack, this allows for inspection of the GCS after disabling as part of fault reporting. Since it is not an expected use case and since it presents some complications in determining what to do with previously initialsed data on the GCS attempts to reenable GCS after this are rejected. This can be revisted if a use case arises. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-23-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/gcs.h | 22 +++++++++ arch/arm64/include/asm/processor.h | 1 + arch/arm64/mm/gcs.c | 79 ++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+) diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h index c1f274fdb9c0..48c97e63e56a 100644 --- a/arch/arm64/include/asm/gcs.h +++ b/arch/arm64/include/asm/gcs.h @@ -50,6 +50,9 @@ static inline u64 gcsss2(void) return Xt; } +#define PR_SHADOW_STACK_SUPPORTED_STATUS_MASK \ + (PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE | PR_SHADOW_STACK_PUSH) + #ifdef CONFIG_ARM64_GCS static inline bool task_gcs_el0_enabled(struct task_struct *task) @@ -63,6 +66,20 @@ void gcs_preserve_current_state(void); unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, const struct kernel_clone_args *args); +static inline int gcs_check_locked(struct task_struct *task, + unsigned long new_val) +{ + unsigned long cur_val = task->thread.gcs_el0_mode; + + cur_val &= task->thread.gcs_el0_locked; + new_val &= task->thread.gcs_el0_locked; + + if (cur_val != new_val) + return -EBUSY; + + return 0; +} + #else static inline bool task_gcs_el0_enabled(struct task_struct *task) @@ -78,6 +95,11 @@ static inline unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, { return -ENOTSUPP; } +static inline int gcs_check_locked(struct task_struct *task, + unsigned long new_val) +{ + return 0; +} #endif diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 5260788247d8..37fefdc3d3a3 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -187,6 +187,7 @@ struct thread_struct { u64 por_el0; #ifdef CONFIG_ARM64_GCS unsigned int gcs_el0_mode; + unsigned int gcs_el0_locked; u64 gcspr_el0; u64 gcs_base; u64 gcs_size; diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c index 3c7a18f57ea9..61a80de6baf8 100644 --- a/arch/arm64/mm/gcs.c +++ b/arch/arm64/mm/gcs.c @@ -109,3 +109,82 @@ void gcs_free(struct task_struct *task) task->thread.gcs_base = 0; task->thread.gcs_size = 0; } + +int arch_set_shadow_stack_status(struct task_struct *task, unsigned long arg) +{ + unsigned long gcs, size; + int ret; + + if (!system_supports_gcs()) + return -EINVAL; + + if (is_compat_thread(task_thread_info(task))) + return -EINVAL; + + /* Reject unknown flags */ + if (arg & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) + return -EINVAL; + + ret = gcs_check_locked(task, arg); + if (ret != 0) + return ret; + + /* If we are enabling GCS then make sure we have a stack */ + if (arg & PR_SHADOW_STACK_ENABLE && + !task_gcs_el0_enabled(task)) { + /* Do not allow GCS to be reenabled */ + if (task->thread.gcs_base || task->thread.gcspr_el0) + return -EINVAL; + + if (task != current) + return -EBUSY; + + size = gcs_size(0); + gcs = alloc_gcs(0, size); + if (!gcs) + return -ENOMEM; + + 
task->thread.gcspr_el0 = gcs + size - sizeof(u64); + task->thread.gcs_base = gcs; + task->thread.gcs_size = size; + if (task == current) + write_sysreg_s(task->thread.gcspr_el0, + SYS_GCSPR_EL0); + } + + task->thread.gcs_el0_mode = arg; + if (task == current) + gcs_set_el0_mode(task); + + return 0; +} + +int arch_get_shadow_stack_status(struct task_struct *task, + unsigned long __user *arg) +{ + if (!system_supports_gcs()) + return -EINVAL; + + if (is_compat_thread(task_thread_info(task))) + return -EINVAL; + + return put_user(task->thread.gcs_el0_mode, arg); +} + +int arch_lock_shadow_stack_status(struct task_struct *task, + unsigned long arg) +{ + if (!system_supports_gcs()) + return -EINVAL; + + if (is_compat_thread(task_thread_info(task))) + return -EINVAL; + + /* + * We support locking unknown bits so applications can prevent + * any changes in a future proof manner. + */ + task->thread.gcs_el0_locked |= arg; + + return 0; +} From 8f3e750673b21ff0613af8b02028200199f3144c Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:03 +0100 Subject: [PATCH 023/168] arm64/mm: Implement map_shadow_stack() As discussed extensively in the changelog for the addition of this syscall on x86 ("x86/shstk: Introduce map_shadow_stack syscall") the existing mmap() and madvise() syscalls do not map entirely well onto the security requirements for guarded control stacks since they lead to windows where memory is allocated but not yet protected or stacks which are not properly and safely initialised. Instead a new syscall map_shadow_stack() has been defined which allocates and initialises a shadow stack page. Implement this for arm64. Two flags are provided, allowing applications to request that the stack be initialised with a valid cap token at the top of the stack and optionally also an end of stack marker above that. We support requesting an end of stack marker alone but since this is a NULL pointer it is indistinguishable from not initialising anything by itself. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Acked-by: Yury Khrustalev Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-24-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/mm/gcs.c | 64 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c index 61a80de6baf8..5c46ec527b1c 100644 --- a/arch/arm64/mm/gcs.c +++ b/arch/arm64/mm/gcs.c @@ -68,6 +68,70 @@ unsigned long gcs_alloc_thread_stack(struct task_struct *tsk, return addr; } +SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags) +{ + unsigned long alloc_size; + unsigned long __user *cap_ptr; + unsigned long cap_val; + int ret = 0; + int cap_offset; + + if (!system_supports_gcs()) + return -EOPNOTSUPP; + + if (flags & ~(SHADOW_STACK_SET_TOKEN | SHADOW_STACK_SET_MARKER)) + return -EINVAL; + + if (!PAGE_ALIGNED(addr)) + return -EINVAL; + + if (size == 8 || !IS_ALIGNED(size, 8)) + return -EINVAL; + + /* + * An overflow would result in attempting to write the restore token + * to the wrong location. Not catastrophic, but just return the right + * error code and block it. + */ + alloc_size = PAGE_ALIGN(size); + if (alloc_size < size) + return -EOVERFLOW; + + addr = alloc_gcs(addr, alloc_size); + if (IS_ERR_VALUE(addr)) + return addr; + + /* + * Put a cap token at the end of the allocated region so it + * can be switched to. 
+ */ + if (flags & SHADOW_STACK_SET_TOKEN) { + /* Leave an extra empty frame as a top of stack marker? */ + if (flags & SHADOW_STACK_SET_MARKER) + cap_offset = 2; + else + cap_offset = 1; + + cap_ptr = (unsigned long __user *)(addr + size - + (cap_offset * sizeof(unsigned long))); + cap_val = GCS_CAP(cap_ptr); + + put_user_gcs(cap_val, cap_ptr, &ret); + if (ret != 0) { + vm_munmap(addr, size); + return -EFAULT; + } + + /* + * Ensure the new cap is ordered before standard + * memory accesses to the same location. + */ + gcsb_dsync(); + } + + return addr; +} + /* * Apply the GCS mode configured for the specified task to the * hardware. From eaf62ce1563b8557e3550acb97d5086120168750 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:04 +0100 Subject: [PATCH 024/168] arm64/signal: Set up and restore the GCS context for signal handlers When invoking a signal handler we use the GCS configuration and stack for the current thread. Since we implement signal return by calling the signal handler with a return address set up pointing to a trampoline in the vDSO we need to also configure any active GCS for this by pushing a frame for the trampoline onto the GCS. If we do not do this then signal return will generate a GCS protection fault. In order to guard against attempts to bypass GCS protections via signal return we only allow returning with GCSPR_EL0 pointing to an address where it was previously preempted by a signal. We do this by pushing a cap onto the GCS, this takes the form of an architectural GCS cap token with the top bit set and token type of 0 which we add on signal entry and validate and pop off on signal return. The combination of the top bit being set and the token type mean that this can't be interpreted as a valid token or address. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-25-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/gcs.h | 1 + arch/arm64/kernel/signal.c | 118 +++++++++++++++++++++++++++++++++-- 2 files changed, 114 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h index 48c97e63e56a..f50660603ecf 100644 --- a/arch/arm64/include/asm/gcs.h +++ b/arch/arm64/include/asm/gcs.h @@ -9,6 +9,7 @@ #include struct kernel_clone_args; +struct ksignal; static inline void gcsb_dsync(void) { diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 561986947530..b5ab0e229a78 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,15 @@ #include #include +#ifdef CONFIG_ARM64_GCS +#define GCS_SIGNAL_CAP(addr) (((unsigned long)addr) & GCS_CAP_ADDR_MASK) + +static bool gcs_signal_cap_valid(u64 addr, u64 val) +{ + return val == GCS_SIGNAL_CAP(addr); +} +#endif + /* * Do a signal return; undo the signal stack. These are aligned to 128-bit. 
*/ @@ -904,6 +914,58 @@ static int restore_sigframe(struct pt_regs *regs, return err; } +#ifdef CONFIG_ARM64_GCS +static int gcs_restore_signal(void) +{ + unsigned long __user *gcspr_el0; + u64 cap; + int ret; + + if (!system_supports_gcs()) + return 0; + + if (!(current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE)) + return 0; + + gcspr_el0 = (unsigned long __user *)read_sysreg_s(SYS_GCSPR_EL0); + + /* + * Ensure that any changes to the GCS done via GCS operations + * are visible to the normal reads we do to validate the + * token. + */ + gcsb_dsync(); + + /* + * GCSPR_EL0 should be pointing at a capped GCS, read the cap. + * We don't enforce that this is in a GCS page, if it is not + * then faults will be generated on GCS operations - the main + * concern is to protect GCS pages. + */ + ret = copy_from_user(&cap, gcspr_el0, sizeof(cap)); + if (ret) + return -EFAULT; + + /* + * Check that the cap is the actual GCS before replacing it. + */ + if (!gcs_signal_cap_valid((u64)gcspr_el0, cap)) + return -EINVAL; + + /* Invalidate the token to prevent reuse */ + put_user_gcs(0, (__user void*)gcspr_el0, &ret); + if (ret != 0) + return -EFAULT; + + write_sysreg_s(gcspr_el0 + 1, SYS_GCSPR_EL0); + + return 0; +} + +#else +static int gcs_restore_signal(void) { return 0; } +#endif + SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); @@ -927,6 +989,9 @@ SYSCALL_DEFINE0(rt_sigreturn) if (restore_sigframe(regs, frame)) goto badframe; + if (gcs_restore_signal()) + goto badframe; + if (restore_altstack(&frame->uc.uc_stack)) goto badframe; @@ -1189,7 +1254,48 @@ static int get_sigframe(struct rt_sigframe_user_layout *user, return 0; } -static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, +#ifdef CONFIG_ARM64_GCS + +static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) +{ + unsigned long __user *gcspr_el0; + int ret = 0; + + if (!system_supports_gcs()) + return 0; + + if (!task_gcs_el0_enabled(current)) + return 0; + + /* + * We are entering a signal handler, current register state is + * active. + */ + gcspr_el0 = (unsigned long __user *)read_sysreg_s(SYS_GCSPR_EL0); + + /* + * Push a cap and the GCS entry for the trampoline onto the GCS. 
+ */ + put_user_gcs((unsigned long)sigtramp, gcspr_el0 - 2, &ret); + put_user_gcs(GCS_SIGNAL_CAP(gcspr_el0 - 1), gcspr_el0 - 1, &ret); + if (ret != 0) + return ret; + + gcspr_el0 -= 2; + write_sysreg_s((unsigned long)gcspr_el0, SYS_GCSPR_EL0); + + return 0; +} +#else + +static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) +{ + return 0; +} + +#endif + +static int setup_return(struct pt_regs *regs, struct ksignal *ksig, struct rt_sigframe_user_layout *user, int usig) { __sigrestore_t sigtramp; @@ -1197,7 +1303,7 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, regs->regs[0] = usig; regs->sp = (unsigned long)user->sigframe; regs->regs[29] = (unsigned long)&user->next_frame->fp; - regs->pc = (unsigned long)ka->sa.sa_handler; + regs->pc = (unsigned long)ksig->ka.sa.sa_handler; /* * Signal delivery is a (wacky) indirect function call in @@ -1240,12 +1346,14 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, if (system_supports_poe()) write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); - if (ka->sa.sa_flags & SA_RESTORER) - sigtramp = ka->sa.sa_restorer; + if (ksig->ka.sa.sa_flags & SA_RESTORER) + sigtramp = ksig->ka.sa.sa_restorer; else sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp); regs->regs[30] = (unsigned long)sigtramp; + + return gcs_signal_entry(sigtramp, ksig); } static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, @@ -1268,7 +1376,7 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, err |= __save_altstack(&frame->uc.uc_stack, regs->sp); err |= setup_sigframe(&user, regs, set); if (err == 0) { - setup_return(regs, &ksig->ka, &user, usig); + err = setup_return(regs, ksig, &user, usig); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { err |= copy_siginfo_to_user(&frame->info, &ksig->info); regs->regs[1] = (unsigned long)&frame->info; From 16f47bb9ac8afe09e7ca14cc53748f779b2a12e0 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:05 +0100 Subject: [PATCH 025/168] arm64/signal: Expose GCS state in signal frames Add a context for the GCS state and include it in the signal context when running on a system that supports GCS. We reuse the same flags that the prctl() uses to specify which GCS features are enabled and also provide the current GCS pointer. We do not support enabling GCS via signal return, there is a conflict between specifying GCSPR_EL0 and allocation of a new GCS and this is not an ancticipated use case. We also enforce GCS configuration locking on signal return. 
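For illustration, a signal handler can locate the new record by walking the _aarch64_ctx headers in the signal frame in the usual way. The sketch below ignores EXTRA_MAGIC handling and defines the structure locally in case installed UAPI headers predate this patch; printf() in a signal handler is for illustration only:

#include <signal.h>
#include <stdio.h>
#include <ucontext.h>
#include <asm/sigcontext.h>

#ifndef GCS_MAGIC
#define GCS_MAGIC 0x47435300

struct gcs_context {
	struct _aarch64_ctx head;
	__u64 gcspr;
	__u64 features_enabled;
	__u64 reserved;
};
#endif

static void handler(int sig, siginfo_t *si, void *uc_)
{
	ucontext_t *uc = uc_;
	struct _aarch64_ctx *head =
		(struct _aarch64_ctx *)uc->uc_mcontext.__reserved;

	/* Records are terminated by a header with magic == 0 */
	for (; head->magic; head = (void *)((char *)head + head->size)) {
		if (head->magic == GCS_MAGIC) {
			struct gcs_context *gcs = (void *)head;

			printf("GCSPR: 0x%llx features: 0x%llx\n",
			       (unsigned long long)gcs->gcspr,
			       (unsigned long long)gcs->features_enabled);
			return;
		}
	}
}

int main(void)
{
	struct sigaction sa = { .sa_sigaction = handler, .sa_flags = SA_SIGINFO };

	sigaction(SIGUSR1, &sa, NULL);
	raise(SIGUSR1);
	return 0;
}
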
Reviewed-by: Catalin Marinas Reviewed-by: Thiago Jung Bauermann Acked-by: Yury Khrustalev Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-26-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/uapi/asm/sigcontext.h | 9 ++ arch/arm64/kernel/signal.c | 109 +++++++++++++++++++++++ 2 files changed, 118 insertions(+) diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h index bb7af77a30a7..d42f7a92238b 100644 --- a/arch/arm64/include/uapi/asm/sigcontext.h +++ b/arch/arm64/include/uapi/asm/sigcontext.h @@ -183,6 +183,15 @@ struct zt_context { __u16 __reserved[3]; }; +#define GCS_MAGIC 0x47435300 + +struct gcs_context { + struct _aarch64_ctx head; + __u64 gcspr; + __u64 features_enabled; + __u64 reserved; +}; + #endif /* !__ASSEMBLY__ */ #include diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index b5ab0e229a78..62d666278264 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -66,6 +66,7 @@ struct rt_sigframe_user_layout { unsigned long fpsimd_offset; unsigned long esr_offset; + unsigned long gcs_offset; unsigned long sve_offset; unsigned long tpidr2_offset; unsigned long za_offset; @@ -198,6 +199,8 @@ struct user_ctxs { u32 fpmr_size; struct poe_context __user *poe; u32 poe_size; + struct gcs_context __user *gcs; + u32 gcs_size; }; static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) @@ -643,6 +646,82 @@ extern int restore_zt_context(struct user_ctxs *user); #endif /* ! CONFIG_ARM64_SME */ +#ifdef CONFIG_ARM64_GCS + +static int preserve_gcs_context(struct gcs_context __user *ctx) +{ + int err = 0; + u64 gcspr = read_sysreg_s(SYS_GCSPR_EL0); + + /* + * If GCS is enabled we will add a cap token to the frame, + * include it in the GCSPR_EL0 we report to support stack + * switching via sigreturn if GCS is enabled. We do not allow + * enabling via sigreturn so the token is only relevant for + * threads with GCS enabled. + */ + if (task_gcs_el0_enabled(current)) + gcspr -= 8; + + __put_user_error(GCS_MAGIC, &ctx->head.magic, err); + __put_user_error(sizeof(*ctx), &ctx->head.size, err); + __put_user_error(gcspr, &ctx->gcspr, err); + __put_user_error(0, &ctx->reserved, err); + __put_user_error(current->thread.gcs_el0_mode, + &ctx->features_enabled, err); + + return err; +} + +static int restore_gcs_context(struct user_ctxs *user) +{ + u64 gcspr, enabled; + int err = 0; + + if (user->gcs_size != sizeof(*user->gcs)) + return -EINVAL; + + __get_user_error(gcspr, &user->gcs->gcspr, err); + __get_user_error(enabled, &user->gcs->features_enabled, err); + if (err) + return err; + + /* Don't allow unknown modes */ + if (enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) + return -EINVAL; + + err = gcs_check_locked(current, enabled); + if (err != 0) + return err; + + /* Don't allow enabling */ + if (!task_gcs_el0_enabled(current) && + (enabled & PR_SHADOW_STACK_ENABLE)) + return -EINVAL; + + /* If we are disabling disable everything */ + if (!(enabled & PR_SHADOW_STACK_ENABLE)) + enabled = 0; + + current->thread.gcs_el0_mode = enabled; + + /* + * We let userspace set GCSPR_EL0 to anything here, we will + * validate later in gcs_restore_signal(). + */ + write_sysreg_s(gcspr, SYS_GCSPR_EL0); + + return 0; +} + +#else /* ! CONFIG_ARM64_GCS */ + +/* Turn any non-optimised out attempts to use these into a link error: */ +extern int preserve_gcs_context(void __user *ctx); +extern int restore_gcs_context(struct user_ctxs *user); + +#endif /* ! 
CONFIG_ARM64_GCS */ + static int parse_user_sigframe(struct user_ctxs *user, struct rt_sigframe __user *sf) { @@ -661,6 +740,7 @@ static int parse_user_sigframe(struct user_ctxs *user, user->zt = NULL; user->fpmr = NULL; user->poe = NULL; + user->gcs = NULL; if (!IS_ALIGNED((unsigned long)base, 16)) goto invalid; @@ -777,6 +857,17 @@ static int parse_user_sigframe(struct user_ctxs *user, user->fpmr_size = size; break; + case GCS_MAGIC: + if (!system_supports_gcs()) + goto invalid; + + if (user->gcs) + goto invalid; + + user->gcs = (struct gcs_context __user *)head; + user->gcs_size = size; + break; + case EXTRA_MAGIC: if (have_extra_context) goto invalid; @@ -896,6 +987,9 @@ static int restore_sigframe(struct pt_regs *regs, err = restore_fpsimd_context(&user); } + if (err == 0 && system_supports_gcs() && user.gcs) + err = restore_gcs_context(&user); + if (err == 0 && system_supports_tpidr2() && user.tpidr2) err = restore_tpidr2_context(&user); @@ -1029,6 +1123,15 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, return err; } +#ifdef CONFIG_ARM64_GCS + if (system_supports_gcs() && (add_all || current->thread.gcspr_el0)) { + err = sigframe_alloc(user, &user->gcs_offset, + sizeof(struct gcs_context)); + if (err) + return err; + } +#endif + if (system_supports_sve() || system_supports_sme()) { unsigned int vq = 0; @@ -1136,6 +1239,12 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, __put_user_error(current->thread.fault_code, &esr_ctx->esr, err); } + if (system_supports_gcs() && err == 0 && user->gcs_offset) { + struct gcs_context __user *gcs_ctx = + apply_user_offset(user, user->gcs_offset); + err |= preserve_gcs_context(gcs_ctx); + } + /* Scalable Vector Extension state (including streaming), if present */ if ((system_supports_sve() || system_supports_sme()) && err == 0 && user->sve_offset) { From 7ec3b57cb29f8371bf12a725b6e8f75831a03f27 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:06 +0100 Subject: [PATCH 026/168] arm64/ptrace: Expose GCS via ptrace and core files Provide a new register type NT_ARM_GCS reporting the current GCS mode and pointer for EL0. Due to the interactions with allocation and deallocation of Guarded Control Stacks we do not permit any changes to the GCS mode via ptrace, only GCSPR_EL0 may be changed. 
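For example, a debugger might read the new regset along these lines (a sketch only: NT_ARM_GCS and the regset layout are as defined in this patch, the structure is repeated locally in case installed UAPI headers predate it, and the tracee must already be ptrace-attached and stopped):

#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <linux/types.h>

#ifndef NT_ARM_GCS
#define NT_ARM_GCS 0x410
#endif

/* Matches the NT_ARM_GCS regset layout added by this patch */
struct user_gcs {
	__u64 features_enabled;
	__u64 features_locked;
	__u64 gcspr_el0;
};

static long read_gcs_regset(pid_t pid, struct user_gcs *regs)
{
	struct iovec iov = {
		.iov_base = regs,
		.iov_len = sizeof(*regs),
	};

	return ptrace(PTRACE_GETREGSET, pid,
		      (void *)(unsigned long)NT_ARM_GCS, &iov);
}
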
Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-27-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/uapi/asm/ptrace.h | 8 ++++ arch/arm64/kernel/ptrace.c | 62 +++++++++++++++++++++++++++- include/uapi/linux/elf.h | 1 + 3 files changed, 70 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index 7fa2f7036aa7..0f39ba4f3efd 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -324,6 +324,14 @@ struct user_za_header { #define ZA_PT_SIZE(vq) \ (ZA_PT_ZA_OFFSET + ZA_PT_ZA_SIZE(vq)) +/* GCS state (NT_ARM_GCS) */ + +struct user_gcs { + __u64 features_enabled; + __u64 features_locked; + __u64 gcspr_el0; +}; + #endif /* __ASSEMBLY__ */ #endif /* _UAPI__ASM_PTRACE_H */ diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index b756578aeaee..6c1dcfe6d25a 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -1473,6 +1474,52 @@ static int poe_set(struct task_struct *target, const struct } #endif +#ifdef CONFIG_ARM64_GCS +static int gcs_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + struct user_gcs user_gcs; + + if (!system_supports_gcs()) + return -EINVAL; + + if (target == current) + gcs_preserve_current_state(); + + user_gcs.features_enabled = target->thread.gcs_el0_mode; + user_gcs.features_locked = target->thread.gcs_el0_locked; + user_gcs.gcspr_el0 = target->thread.gcspr_el0; + + return membuf_write(&to, &user_gcs, sizeof(user_gcs)); +} + +static int gcs_set(struct task_struct *target, const struct + user_regset *regset, unsigned int pos, + unsigned int count, const void *kbuf, const + void __user *ubuf) +{ + int ret; + struct user_gcs user_gcs; + + if (!system_supports_gcs()) + return -EINVAL; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_gcs, 0, -1); + if (ret) + return ret; + + if (user_gcs.features_enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) + return -EINVAL; + + target->thread.gcs_el0_mode = user_gcs.features_enabled; + target->thread.gcs_el0_locked = user_gcs.features_locked; + target->thread.gcspr_el0 = user_gcs.gcspr_el0; + + return 0; +} +#endif + enum aarch64_regset { REGSET_GPR, REGSET_FPR, @@ -1503,7 +1550,10 @@ enum aarch64_regset { REGSET_TAGGED_ADDR_CTRL, #endif #ifdef CONFIG_ARM64_POE - REGSET_POE + REGSET_POE, +#endif +#ifdef CONFIG_ARM64_GCS + REGSET_GCS, #endif }; @@ -1674,6 +1724,16 @@ static const struct user_regset aarch64_regsets[] = { .set = poe_set, }, #endif +#ifdef CONFIG_ARM64_GCS + [REGSET_GCS] = { + .core_note_type = NT_ARM_GCS, + .n = sizeof(struct user_gcs) / sizeof(u64), + .size = sizeof(u64), + .align = sizeof(u64), + .regset_get = gcs_get, + .set = gcs_set, + }, +#endif }; static const struct user_regset_view user_aarch64_view = { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index b9935988da5c..9adc218fb6df 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -443,6 +443,7 @@ typedef struct elf64_shdr { #define NT_ARM_ZT 0x40d /* ARM SME ZT registers */ #define NT_ARM_FPMR 0x40e /* ARM floating point mode register */ #define NT_ARM_POE 0x40f /* ARM POE registers */ +#define NT_ARM_GCS 0x410 /* ARM GCS state */ #define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ #define NT_VMCOREDD 0x700 /* Vmcore 
Device Dump Note */ #define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ From 5d8b172e7005c6b42c16a0952c1d8873051d68ae Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:07 +0100 Subject: [PATCH 027/168] arm64: Add Kconfig for Guarded Control Stack (GCS) Provide a Kconfig option allowing the user to select if GCS support is built into the kernel. Reviewed-by: Thiago Jung Bauermann Reviewed-by: Catalin Marinas Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-28-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3e29b44d2d7b..dcb12f041c13 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2178,6 +2178,27 @@ config ARCH_PKEY_BITS endmenu # "ARMv8.9 architectural features" +menu "v9.4 architectural features" + +config ARM64_GCS + bool "Enable support for Guarded Control Stack (GCS)" + default y + select ARCH_HAS_USER_SHADOW_STACK + select ARCH_USES_HIGH_VMA_FLAGS + depends on !UPROBES + help + Guarded Control Stack (GCS) provides support for a separate + stack with restricted access which contains only return + addresses. This can be used to harden against some attacks + by comparing return address used by the program with what is + stored in the GCS, and may also be used to efficiently obtain + the call stack for applications such as profiling. + + The feature is detected at runtime, and will remain disabled + if the system does not implement the feature. + +endmenu # "v9.4 architectural features" + config ARM64_SVE bool "ARM Scalable Vector Extension support" default y From 7a2f671db61f32de0671eeb163a7764e5a258114 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:08 +0100 Subject: [PATCH 028/168] kselftest/arm64: Verify the GCS hwcap Add coverage of the GCS hwcap to the hwcap selftest, using a read of GCSPR_EL0 to generate SIGILL without having to worry about enabling GCS. Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-29-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/abi/hwcap.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index f2d6007a2b98..1f07772ae578 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -98,6 +98,17 @@ static void fpmr_sigill(void) asm volatile("mrs x0, S3_3_C4_C4_2" : : : "x0"); } +static void gcs_sigill(void) +{ + unsigned long *gcspr; + + asm volatile( + "mrs %0, S3_3_C2_C5_1" + : "=r" (gcspr) + : + : "cc"); +} + static void ilrcpc_sigill(void) { /* LDAPUR W0, [SP, #8] */ @@ -534,6 +545,14 @@ static const struct hwcap_data { .sigill_fn = fpmr_sigill, .sigill_reliable = true, }, + { + .name = "GCS", + .at_hwcap = AT_HWCAP, + .hwcap_bit = HWCAP_GCS, + .cpuinfo = "gcs", + .sigill_fn = gcs_sigill, + .sigill_reliable = true, + }, { .name = "JSCVT", .at_hwcap = AT_HWCAP, From b2d2f11ff5d69cd4b3585ddab4bec9f69503f680 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:09 +0100 Subject: [PATCH 029/168] kselftest/arm64: Add GCS as a detected feature in the signal tests In preparation for testing GCS related signal handling add it as a feature we check for in the signal handling support code. 
Reviewed-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-30-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/signal/test_signals.h | 2 ++ tools/testing/selftests/arm64/signal/test_signals_utils.c | 3 +++ 2 files changed, 5 insertions(+) diff --git a/tools/testing/selftests/arm64/signal/test_signals.h b/tools/testing/selftests/arm64/signal/test_signals.h index 1e6273d81575..7ada43688c02 100644 --- a/tools/testing/selftests/arm64/signal/test_signals.h +++ b/tools/testing/selftests/arm64/signal/test_signals.h @@ -35,6 +35,7 @@ enum { FSME_BIT, FSME_FA64_BIT, FSME2_BIT, + FGCS_BIT, FMAX_END }; @@ -43,6 +44,7 @@ enum { #define FEAT_SME (1UL << FSME_BIT) #define FEAT_SME_FA64 (1UL << FSME_FA64_BIT) #define FEAT_SME2 (1UL << FSME2_BIT) +#define FEAT_GCS (1UL << FGCS_BIT) /* * A descriptor used to describe and configure a test case. diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.c b/tools/testing/selftests/arm64/signal/test_signals_utils.c index 0dc948db3a4a..dcc49e3ce1eb 100644 --- a/tools/testing/selftests/arm64/signal/test_signals_utils.c +++ b/tools/testing/selftests/arm64/signal/test_signals_utils.c @@ -30,6 +30,7 @@ static char const *const feats_names[FMAX_END] = { " SME ", " FA64 ", " SME2 ", + " GCS ", }; #define MAX_FEATS_SZ 128 @@ -329,6 +330,8 @@ int test_init(struct tdescr *td) td->feats_supported |= FEAT_SME_FA64; if (getauxval(AT_HWCAP2) & HWCAP2_SME2) td->feats_supported |= FEAT_SME2; + if (getauxval(AT_HWCAP) & HWCAP_GCS) + td->feats_supported |= FEAT_GCS; if (feats_ok(td)) { if (td->feats_required & td->feats_supported) fprintf(stderr, From 0d426f7dd9a0d88aa39c1dd54a6bf10f0466c6b9 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:10 +0100 Subject: [PATCH 030/168] kselftest/arm64: Add framework support for GCS to signal handling tests Teach the framework about the GCS signal context, avoiding warnings on the unknown context. 
Reviewed-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-31-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/signal/testcases/testcases.c | 7 +++++++ tools/testing/selftests/arm64/signal/testcases/testcases.h | 1 + 2 files changed, 8 insertions(+) diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.c b/tools/testing/selftests/arm64/signal/testcases/testcases.c index e6daa94fcd2e..0c1a6b26afac 100644 --- a/tools/testing/selftests/arm64/signal/testcases/testcases.c +++ b/tools/testing/selftests/arm64/signal/testcases/testcases.c @@ -198,6 +198,13 @@ bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err) *err = "Bad size for fpmr_context"; new_flags |= FPMR_CTX; break; + case GCS_MAGIC: + if (flags & GCS_CTX) + *err = "Multiple GCS_MAGIC"; + if (head->size != sizeof(struct gcs_context)) + *err = "Bad size for gcs_context"; + new_flags |= GCS_CTX; + break; case EXTRA_MAGIC: if (flags & EXTRA_CTX) *err = "Multiple EXTRA_MAGIC"; diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.h b/tools/testing/selftests/arm64/signal/testcases/testcases.h index 9872b8912714..98b97efdda23 100644 --- a/tools/testing/selftests/arm64/signal/testcases/testcases.h +++ b/tools/testing/selftests/arm64/signal/testcases/testcases.h @@ -20,6 +20,7 @@ #define EXTRA_CTX (1 << 3) #define ZT_CTX (1 << 4) #define FPMR_CTX (1 << 5) +#define GCS_CTX (1 << 6) #define KSFT_BAD_MAGIC 0xdeadbeef From 956573ac189066a32326245ebf5abf35b64a490f Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:11 +0100 Subject: [PATCH 031/168] kselftest/arm64: Allow signals tests to specify an expected si_code Currently we ignore si_code unless the expected signal is a SIGSEGV, in which case we enforce it being SEGV_ACCERR. Allow test cases to specify exactly which si_code should be generated so we can validate this, and test for other segfault codes. Reviewed-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-32-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- .../selftests/arm64/signal/test_signals.h | 4 +++ .../arm64/signal/test_signals_utils.c | 29 ++++++++++++------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/arm64/signal/test_signals.h b/tools/testing/selftests/arm64/signal/test_signals.h index 7ada43688c02..ee75a2c25ce7 100644 --- a/tools/testing/selftests/arm64/signal/test_signals.h +++ b/tools/testing/selftests/arm64/signal/test_signals.h @@ -71,6 +71,10 @@ struct tdescr { * Zero when no signal is expected on success */ int sig_ok; + /* + * expected si_code for sig_ok, or 0 to not check + */ + int sig_ok_code; /* signum expected on unsupported CPU features. */ int sig_unsupp; /* a timeout in second for test completion */ diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.c b/tools/testing/selftests/arm64/signal/test_signals_utils.c index dcc49e3ce1eb..5d3621921cfe 100644 --- a/tools/testing/selftests/arm64/signal/test_signals_utils.c +++ b/tools/testing/selftests/arm64/signal/test_signals_utils.c @@ -143,16 +143,25 @@ static bool handle_signal_ok(struct tdescr *td, "current->token ZEROED...test is probably broken!\n"); abort(); } - /* - * Trying to narrow down the SEGV to the ones generated by Kernel itself - * via arm64_notify_segfault(). 
This is a best-effort check anyway, and - * the si_code check may need to change if this aspect of the kernel - * ABI changes. - */ - if (td->sig_ok == SIGSEGV && si->si_code != SEGV_ACCERR) { - fprintf(stdout, - "si_code != SEGV_ACCERR...test is probably broken!\n"); - abort(); + if (td->sig_ok_code) { + if (si->si_code != td->sig_ok_code) { + fprintf(stdout, "si_code is %d not %d\n", + si->si_code, td->sig_ok_code); + abort(); + } + } else { + /* + * Trying to narrow down the SEGV to the ones + * generated by Kernel itself via + * arm64_notify_segfault(). This is a best-effort + * check anyway, and the si_code check may need to + * change if this aspect of the kernel ABI changes. + */ + if (td->sig_ok == SIGSEGV && si->si_code != SEGV_ACCERR) { + fprintf(stdout, + "si_code != SEGV_ACCERR...test is probably broken!\n"); + abort(); + } } td->pass = 1; /* From 42155a8eb0f63f634a98ad17a85e9f2826bcff11 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:12 +0100 Subject: [PATCH 032/168] kselftest/arm64: Always run signals tests with GCS enabled Since it is not possible to return from the function that enabled GCS without disabling GCS it is very inconvenient to use the signal handling tests to cover GCS when GCS is not enabled by the toolchain and runtime, something that no current distribution does. Since none of the testcases do anything with stacks that would cause problems with GCS we can sidestep this issue by unconditionally enabling GCS on startup and exiting with a call to exit() rather than a return from main(). Reviewed-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-33-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- .../selftests/arm64/signal/test_signals.c | 17 ++++++++++- .../arm64/signal/test_signals_utils.h | 29 +++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/signal/test_signals.c b/tools/testing/selftests/arm64/signal/test_signals.c index 00051b40d71e..1304c8ec0f2f 100644 --- a/tools/testing/selftests/arm64/signal/test_signals.c +++ b/tools/testing/selftests/arm64/signal/test_signals.c @@ -7,6 +7,10 @@ * Each test provides its own tde struct tdescr descriptor to link with * this wrapper. Framework provides common helpers. */ + +#include +#include + #include #include "test_signals.h" @@ -16,6 +20,16 @@ struct tdescr *current = &tde; int main(int argc, char *argv[]) { + /* + * Ensure GCS is at least enabled throughout the tests if + * supported, otherwise the inability to return from the + * function that enabled GCS makes it very inconvenient to set + * up test cases. The prctl() may fail if GCS was locked by + * libc setup code. 
+ */ + if (getauxval(AT_HWCAP) & HWCAP_GCS) + gcs_set_state(PR_SHADOW_STACK_ENABLE); + ksft_print_msg("%s :: %s\n", current->name, current->descr); if (test_setup(current) && test_init(current)) { test_run(current); @@ -23,5 +37,6 @@ int main(int argc, char *argv[]) } test_result(current); - return current->result; + /* Do not return in case GCS was enabled */ + exit(current->result); } diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.h b/tools/testing/selftests/arm64/signal/test_signals_utils.h index 762c8fe9c54a..1e80808ee105 100644 --- a/tools/testing/selftests/arm64/signal/test_signals_utils.h +++ b/tools/testing/selftests/arm64/signal/test_signals_utils.h @@ -18,6 +18,35 @@ void test_cleanup(struct tdescr *td); int test_run(struct tdescr *td); void test_result(struct tdescr *td); +#ifndef __NR_prctl +#define __NR_prctl 167 +#endif + +/* + * The prctl takes 1 argument but we need to ensure that the other + * values passed in registers to the syscall are zero since the kernel + * validates them. + */ +#define gcs_set_state(state) \ + ({ \ + register long _num __asm__ ("x8") = __NR_prctl; \ + register long _arg1 __asm__ ("x0") = PR_SET_SHADOW_STACK_STATUS; \ + register long _arg2 __asm__ ("x1") = (long)(state); \ + register long _arg3 __asm__ ("x2") = 0; \ + register long _arg4 __asm__ ("x3") = 0; \ + register long _arg5 __asm__ ("x4") = 0; \ + \ + __asm__ volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), \ + "r"(_arg3), "r"(_arg4), \ + "r"(_arg5), "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ + }) + static inline bool feats_ok(struct tdescr *td) { if (td->feats_incompatible & td->feats_supported) From 3d37d4307e0fc958c4461bb6973ce5573d1570c2 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:13 +0100 Subject: [PATCH 033/168] kselftest/arm64: Add very basic GCS test program This test program just covers the basic GCS ABI, covering aspects of the ABI as standalone features without attempting to integrate things. 
Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-34-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/Makefile | 2 +- tools/testing/selftests/arm64/gcs/.gitignore | 1 + tools/testing/selftests/arm64/gcs/Makefile | 18 + tools/testing/selftests/arm64/gcs/basic-gcs.c | 357 ++++++++++++++++++ tools/testing/selftests/arm64/gcs/gcs-util.h | 90 +++++ 5 files changed, 467 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/arm64/gcs/.gitignore create mode 100644 tools/testing/selftests/arm64/gcs/Makefile create mode 100644 tools/testing/selftests/arm64/gcs/basic-gcs.c create mode 100644 tools/testing/selftests/arm64/gcs/gcs-util.h diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile index 28b93cab8c0d..22029e60eff3 100644 --- a/tools/testing/selftests/arm64/Makefile +++ b/tools/testing/selftests/arm64/Makefile @@ -4,7 +4,7 @@ ARCH ?= $(shell uname -m 2>/dev/null || echo not) ifneq (,$(filter $(ARCH),aarch64 arm64)) -ARM64_SUBTARGETS ?= tags signal pauth fp mte bti abi +ARM64_SUBTARGETS ?= tags signal pauth fp mte bti abi gcs else ARM64_SUBTARGETS := endif diff --git a/tools/testing/selftests/arm64/gcs/.gitignore b/tools/testing/selftests/arm64/gcs/.gitignore new file mode 100644 index 000000000000..0e5e695ecba5 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/.gitignore @@ -0,0 +1 @@ +basic-gcs diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile new file mode 100644 index 000000000000..61a30f483429 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/Makefile @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2023 ARM Limited +# +# In order to avoid interaction with the toolchain and dynamic linker the +# portions of these tests that interact with the GCS are implemented using +# nolibc. +# + +TEST_GEN_PROGS := basic-gcs + +include ../../lib.mk + +$(OUTPUT)/basic-gcs: basic-gcs.c + $(CC) -g -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \ + -static -include ../../../../include/nolibc/nolibc.h \ + -I../../../../../usr/include \ + -std=gnu99 -I../.. -g \ + -ffreestanding -Wall $^ -o $@ -lgcc diff --git a/tools/testing/selftests/arm64/gcs/basic-gcs.c b/tools/testing/selftests/arm64/gcs/basic-gcs.c new file mode 100644 index 000000000000..3fb9742342a3 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/basic-gcs.c @@ -0,0 +1,357 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Limited. + */ + +#include +#include + +#include + +#include +#include +#include + +#include "kselftest.h" +#include "gcs-util.h" + +/* nolibc doesn't have sysconf(), just hard code the maximum */ +static size_t page_size = 65536; + +static __attribute__((noinline)) void valid_gcs_function(void) +{ + /* Do something the compiler can't optimise out */ + my_syscall1(__NR_prctl, PR_SVE_GET_VL); +} + +static inline int gcs_set_status(unsigned long mode) +{ + bool enabling = mode & PR_SHADOW_STACK_ENABLE; + int ret; + unsigned long new_mode; + + /* + * The prctl takes 1 argument but we need to ensure that the + * other 3 values passed in registers to the syscall are zero + * since the kernel validates them. 
+ */ + ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, mode, + 0, 0, 0); + + if (ret == 0) { + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, + &new_mode, 0, 0, 0); + if (ret == 0) { + if (new_mode != mode) { + ksft_print_msg("Mode set to %lx not %lx\n", + new_mode, mode); + ret = -EINVAL; + } + } else { + ksft_print_msg("Failed to validate mode: %d\n", ret); + } + + if (enabling != chkfeat_gcs()) { + ksft_print_msg("%senabled by prctl but %senabled in CHKFEAT\n", + enabling ? "" : "not ", + chkfeat_gcs() ? "" : "not "); + ret = -EINVAL; + } + } + + return ret; +} + +/* Try to read the status */ +static bool read_status(void) +{ + unsigned long state; + int ret; + + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, + &state, 0, 0, 0); + if (ret != 0) { + ksft_print_msg("Failed to read state: %d\n", ret); + return false; + } + + return state & PR_SHADOW_STACK_ENABLE; +} + +/* Just a straight enable */ +static bool base_enable(void) +{ + int ret; + + ret = gcs_set_status(PR_SHADOW_STACK_ENABLE); + if (ret) { + ksft_print_msg("PR_SHADOW_STACK_ENABLE failed %d\n", ret); + return false; + } + + return true; +} + +/* Check we can read GCSPR_EL0 when GCS is enabled */ +static bool read_gcspr_el0(void) +{ + unsigned long *gcspr_el0; + + ksft_print_msg("GET GCSPR\n"); + gcspr_el0 = get_gcspr(); + ksft_print_msg("GCSPR_EL0 is %p\n", gcspr_el0); + + return true; +} + +/* Also allow writes to stack */ +static bool enable_writeable(void) +{ + int ret; + + ret = gcs_set_status(PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE); + if (ret) { + ksft_print_msg("PR_SHADOW_STACK_ENABLE writeable failed: %d\n", ret); + return false; + } + + ret = gcs_set_status(PR_SHADOW_STACK_ENABLE); + if (ret) { + ksft_print_msg("failed to restore plain enable %d\n", ret); + return false; + } + + return true; +} + +/* Also allow writes to stack */ +static bool enable_push_pop(void) +{ + int ret; + + ret = gcs_set_status(PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_PUSH); + if (ret) { + ksft_print_msg("PR_SHADOW_STACK_ENABLE with push failed: %d\n", + ret); + return false; + } + + ret = gcs_set_status(PR_SHADOW_STACK_ENABLE); + if (ret) { + ksft_print_msg("failed to restore plain enable %d\n", ret); + return false; + } + + return true; +} + +/* Enable GCS and allow everything */ +static bool enable_all(void) +{ + int ret; + + ret = gcs_set_status(PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_PUSH | + PR_SHADOW_STACK_WRITE); + if (ret) { + ksft_print_msg("PR_SHADOW_STACK_ENABLE with everything failed: %d\n", + ret); + return false; + } + + ret = gcs_set_status(PR_SHADOW_STACK_ENABLE); + if (ret) { + ksft_print_msg("failed to restore plain enable %d\n", ret); + return false; + } + + return true; +} + +static bool enable_invalid(void) +{ + int ret = gcs_set_status(ULONG_MAX); + if (ret == 0) { + ksft_print_msg("GCS_SET_STATUS %lx succeeded\n", ULONG_MAX); + return false; + } + + return true; +} + +/* Map a GCS */ +static bool map_guarded_stack(void) +{ + int ret; + uint64_t *buf; + uint64_t expected_cap; + int elem; + bool pass = true; + + buf = (void *)my_syscall3(__NR_map_shadow_stack, 0, page_size, + SHADOW_STACK_SET_MARKER | + SHADOW_STACK_SET_TOKEN); + if (buf == MAP_FAILED) { + ksft_print_msg("Failed to map %lu byte GCS: %d\n", + page_size, errno); + return false; + } + ksft_print_msg("Mapped GCS at %p-%p\n", buf, + (void *)((uint64_t)buf + page_size)); + + /* The top of the newly allocated region should be 0 */ + elem = (page_size / sizeof(uint64_t)) - 1; + if (buf[elem]) { + ksft_print_msg("Last entry 
is 0x%llx not 0x0\n", buf[elem]); + pass = false; + } + + /* Then a valid cap token */ + elem--; + expected_cap = ((uint64_t)buf + page_size - 16); + expected_cap &= GCS_CAP_ADDR_MASK; + expected_cap |= GCS_CAP_VALID_TOKEN; + if (buf[elem] != expected_cap) { + ksft_print_msg("Cap entry is 0x%llx not 0x%llx\n", + buf[elem], expected_cap); + pass = false; + } + ksft_print_msg("cap token is 0x%llx\n", buf[elem]); + + /* The rest should be zeros */ + for (elem = 0; elem < page_size / sizeof(uint64_t) - 2; elem++) { + if (!buf[elem]) + continue; + ksft_print_msg("GCS slot %d is 0x%llx not 0x0\n", + elem, buf[elem]); + pass = false; + } + + ret = munmap(buf, page_size); + if (ret != 0) { + ksft_print_msg("Failed to unmap %ld byte GCS: %d\n", + page_size, errno); + pass = false; + } + + return pass; +} + +/* A fork()ed process can run */ +static bool test_fork(void) +{ + unsigned long child_mode; + int ret, status; + pid_t pid; + bool pass = true; + + pid = fork(); + if (pid == -1) { + ksft_print_msg("fork() failed: %d\n", errno); + pass = false; + goto out; + } + if (pid == 0) { + /* In child, make sure we can call a function, read + * the GCS pointer and status and then exit */ + valid_gcs_function(); + get_gcspr(); + + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, + &child_mode, 0, 0, 0); + if (ret == 0 && !(child_mode & PR_SHADOW_STACK_ENABLE)) { + ksft_print_msg("GCS not enabled in child\n"); + ret = -EINVAL; + } + + exit(ret); + } + + /* + * In parent, check we can still do function calls then block + * for the child. + */ + valid_gcs_function(); + + ksft_print_msg("Waiting for child %d\n", pid); + + ret = waitpid(pid, &status, 0); + if (ret == -1) { + ksft_print_msg("Failed to wait for child: %d\n", + errno); + return false; + } + + if (!WIFEXITED(status)) { + ksft_print_msg("Child exited due to signal %d\n", + WTERMSIG(status)); + pass = false; + } else { + if (WEXITSTATUS(status)) { + ksft_print_msg("Child exited with status %d\n", + WEXITSTATUS(status)); + pass = false; + } + } + +out: + + return pass; +} + +typedef bool (*gcs_test)(void); + +static struct { + char *name; + gcs_test test; + bool needs_enable; +} tests[] = { + { "read_status", read_status }, + { "base_enable", base_enable, true }, + { "read_gcspr_el0", read_gcspr_el0 }, + { "enable_writeable", enable_writeable, true }, + { "enable_push_pop", enable_push_pop, true }, + { "enable_all", enable_all, true }, + { "enable_invalid", enable_invalid, true }, + { "map_guarded_stack", map_guarded_stack }, + { "fork", test_fork }, +}; + +int main(void) +{ + int i, ret; + unsigned long gcs_mode; + + ksft_print_header(); + + /* + * We don't have getauxval() with nolibc so treat a failure to + * read GCS state as a lack of support and skip. 
+ */ + ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, + &gcs_mode, 0, 0, 0); + if (ret != 0) + ksft_exit_skip("Failed to read GCS state: %d\n", ret); + + if (!(gcs_mode & PR_SHADOW_STACK_ENABLE)) { + gcs_mode = PR_SHADOW_STACK_ENABLE; + ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, + gcs_mode, 0, 0, 0); + if (ret != 0) + ksft_exit_fail_msg("Failed to enable GCS: %d\n", ret); + } + + ksft_set_plan(ARRAY_SIZE(tests)); + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + ksft_test_result((*tests[i].test)(), "%s\n", tests[i].name); + } + + /* One last test: disable GCS, we can do this one time */ + my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0); + if (ret != 0) + ksft_print_msg("Failed to disable GCS: %d\n", ret); + + ksft_finished(); + + return 0; +} diff --git a/tools/testing/selftests/arm64/gcs/gcs-util.h b/tools/testing/selftests/arm64/gcs/gcs-util.h new file mode 100644 index 000000000000..1ae6864d3f86 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/gcs-util.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023 ARM Limited. + */ + +#ifndef GCS_UTIL_H +#define GCS_UTIL_H + +#include + +#ifndef __NR_map_shadow_stack +#define __NR_map_shadow_stack 453 +#endif + +#ifndef __NR_prctl +#define __NR_prctl 167 +#endif + +/* Shadow Stack/Guarded Control Stack interface */ +#define PR_GET_SHADOW_STACK_STATUS 74 +#define PR_SET_SHADOW_STACK_STATUS 75 +#define PR_LOCK_SHADOW_STACK_STATUS 76 + +# define PR_SHADOW_STACK_ENABLE (1UL << 0) +# define PR_SHADOW_STACK_WRITE (1UL << 1) +# define PR_SHADOW_STACK_PUSH (1UL << 2) + +#define PR_SHADOW_STACK_ALL_MODES \ + PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE | PR_SHADOW_STACK_PUSH + +#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in the shadow stack */ +#define SHADOW_STACK_SET_MARKER (1ULL << 1) /* Set up a top of stack merker in the shadow stack */ + +#define GCS_CAP_ADDR_MASK (0xfffffffffffff000UL) +#define GCS_CAP_TOKEN_MASK (0x0000000000000fffUL) +#define GCS_CAP_VALID_TOKEN 1 +#define GCS_CAP_IN_PROGRESS_TOKEN 5 + +#define GCS_CAP(x) (((unsigned long)(x) & GCS_CAP_ADDR_MASK) | \ + GCS_CAP_VALID_TOKEN) + +static inline unsigned long *get_gcspr(void) +{ + unsigned long *gcspr; + + asm volatile( + "mrs %0, S3_3_C2_C5_1" + : "=r" (gcspr) + : + : "cc"); + + return gcspr; +} + +static inline void __attribute__((always_inline)) gcsss1(unsigned long *Xt) +{ + asm volatile ( + "sys #3, C7, C7, #2, %0\n" + : + : "rZ" (Xt) + : "memory"); +} + +static inline unsigned long __attribute__((always_inline)) *gcsss2(void) +{ + unsigned long *Xt; + + asm volatile( + "SYSL %0, #3, C7, C7, #3\n" + : "=r" (Xt) + : + : "memory"); + + return Xt; +} + +static inline bool chkfeat_gcs(void) +{ + register long val __asm__ ("x16") = 1; + + /* CHKFEAT x16 */ + asm volatile( + "hint #0x28\n" + : "=r" (val) + : "r" (val)); + + return val != 1; +} + +#endif From a505a52b4e292f5e031a01eb3d4e203eb18acb7d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:14 +0100 Subject: [PATCH 034/168] kselftest/arm64: Add a GCS test program built with the system libc There are things like threads which nolibc struggles with which we want to add coverage for, and the ABI allows us to test most of these even if libc itself does not understand GCS so add a test application built using the system libc. 
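To illustrate the ABI property most of the libc-based tests rely on, here is a minimal sketch (illustrative only, not part of the test code that follows; check_thread and main are made-up names, and the PR_* values mirror gcs-util.h below): a thread created through the system libc inherits the process GCS state and can confirm it with the arch-agnostic prctl(), passing all five arguments since some libcs do not zero unused ones.

  /* Illustrative sketch, not the selftest code itself */
  #define _GNU_SOURCE
  #include <pthread.h>
  #include <stdio.h>
  #include <sys/prctl.h>

  #ifndef PR_GET_SHADOW_STACK_STATUS
  #define PR_GET_SHADOW_STACK_STATUS	74
  #define PR_SHADOW_STACK_ENABLE		(1UL << 0)
  #endif

  /* Runs in a fresh thread: GCS enablement should have been inherited */
  static void *check_thread(void *arg)
  {
          unsigned long mode = 0;

          if (prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0) == 0 &&
              (mode & PR_SHADOW_STACK_ENABLE))
                  printf("GCS enabled in new thread\n");

          return NULL;
  }

  int main(void)
  {
          pthread_t t;

          if (pthread_create(&t, NULL, check_thread, NULL) == 0)
                  pthread_join(t, NULL);

          return 0;
  }

Note that the tests below enable GCS through a raw syscall wrapper rather than the libc prctl() wrapper, since a libc that is unaware of GCS cannot safely return from the enabling call.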
Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-35-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/gcs/.gitignore | 1 + tools/testing/selftests/arm64/gcs/Makefile | 4 +- tools/testing/selftests/arm64/gcs/gcs-util.h | 10 + tools/testing/selftests/arm64/gcs/libc-gcs.c | 728 +++++++++++++++++++ 4 files changed, 742 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/arm64/gcs/libc-gcs.c diff --git a/tools/testing/selftests/arm64/gcs/.gitignore b/tools/testing/selftests/arm64/gcs/.gitignore index 0e5e695ecba5..5810c4a163d4 100644 --- a/tools/testing/selftests/arm64/gcs/.gitignore +++ b/tools/testing/selftests/arm64/gcs/.gitignore @@ -1 +1,2 @@ basic-gcs +libc-gcs diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile index 61a30f483429..a8fdf21e9a47 100644 --- a/tools/testing/selftests/arm64/gcs/Makefile +++ b/tools/testing/selftests/arm64/gcs/Makefile @@ -6,7 +6,9 @@ # nolibc. # -TEST_GEN_PROGS := basic-gcs +TEST_GEN_PROGS := basic-gcs libc-gcs + +LDLIBS+=-lpthread include ../../lib.mk diff --git a/tools/testing/selftests/arm64/gcs/gcs-util.h b/tools/testing/selftests/arm64/gcs/gcs-util.h index 1ae6864d3f86..c99a6b39ac14 100644 --- a/tools/testing/selftests/arm64/gcs/gcs-util.h +++ b/tools/testing/selftests/arm64/gcs/gcs-util.h @@ -16,6 +16,16 @@ #define __NR_prctl 167 #endif +#ifndef NT_ARM_GCS +#define NT_ARM_GCS 0x410 + +struct user_gcs { + __u64 features_enabled; + __u64 features_locked; + __u64 gcspr_el0; +}; +#endif + /* Shadow Stack/Guarded Control Stack interface */ #define PR_GET_SHADOW_STACK_STATUS 74 #define PR_SET_SHADOW_STACK_STATUS 75 diff --git a/tools/testing/selftests/arm64/gcs/libc-gcs.c b/tools/testing/selftests/arm64/gcs/libc-gcs.c new file mode 100644 index 000000000000..17b2fabfec38 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/libc-gcs.c @@ -0,0 +1,728 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Limited. + */ + +#define _GNU_SOURCE + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "kselftest_harness.h" + +#include "gcs-util.h" + +#define my_syscall2(num, arg1, arg2) \ +({ \ + register long _num __asm__ ("x8") = (num); \ + register long _arg1 __asm__ ("x0") = (long)(arg1); \ + register long _arg2 __asm__ ("x1") = (long)(arg2); \ + register long _arg3 __asm__ ("x2") = 0; \ + register long _arg4 __asm__ ("x3") = 0; \ + register long _arg5 __asm__ ("x4") = 0; \ + \ + __asm__ volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), \ + "r"(_arg3), "r"(_arg4), \ + "r"(_arg5), "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +static noinline void gcs_recurse(int depth) +{ + if (depth) + gcs_recurse(depth - 1); + + /* Prevent tail call optimization so we actually recurse */ + asm volatile("dsb sy" : : : "memory"); +} + +/* Smoke test that a function call and return works*/ +TEST(can_call_function) +{ + gcs_recurse(0); +} + +static void *gcs_test_thread(void *arg) +{ + int ret; + unsigned long mode; + + /* + * Some libcs don't seem to fill unused arguments with 0 but + * the kernel validates this so we supply all 5 arguments. 
+ */ + ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0); + if (ret != 0) { + ksft_print_msg("PR_GET_SHADOW_STACK_STATUS failed: %d\n", ret); + return NULL; + } + + if (!(mode & PR_SHADOW_STACK_ENABLE)) { + ksft_print_msg("GCS not enabled in thread, mode is %lu\n", + mode); + return NULL; + } + + /* Just in case... */ + gcs_recurse(0); + + /* Use a non-NULL value to indicate a pass */ + return &gcs_test_thread; +} + +/* Verify that if we start a new thread it has GCS enabled */ +TEST(gcs_enabled_thread) +{ + pthread_t thread; + void *thread_ret; + int ret; + + ret = pthread_create(&thread, NULL, gcs_test_thread, NULL); + ASSERT_TRUE(ret == 0); + if (ret != 0) + return; + + ret = pthread_join(thread, &thread_ret); + ASSERT_TRUE(ret == 0); + if (ret != 0) + return; + + ASSERT_TRUE(thread_ret != NULL); +} + +/* Read the GCS until we find the terminator */ +TEST(gcs_find_terminator) +{ + unsigned long *gcs, *cur; + + gcs = get_gcspr(); + cur = gcs; + while (*cur) + cur++; + + ksft_print_msg("GCS in use from %p-%p\n", gcs, cur); + + /* + * We should have at least whatever called into this test so + * the two pointer should differ. + */ + ASSERT_TRUE(gcs != cur); +} + +/* + * We can access a GCS via ptrace + * + * This could usefully have a fixture but note that each test is + * fork()ed into a new child whcih causes issues. Might be better to + * lift at least some of this out into a separate, non-harness, test + * program. + */ +TEST(ptrace_read_write) +{ + pid_t child, pid; + int ret, status; + siginfo_t si; + uint64_t val, rval, gcspr; + struct user_gcs child_gcs; + struct iovec iov, local_iov, remote_iov; + + child = fork(); + if (child == -1) { + ksft_print_msg("fork() failed: %d (%s)\n", + errno, strerror(errno)); + ASSERT_NE(child, -1); + } + + if (child == 0) { + /* + * In child, make sure there's something on the stack and + * ask to be traced. + */ + gcs_recurse(0); + if (ptrace(PTRACE_TRACEME, -1, NULL, NULL)) + ksft_exit_fail_msg("PTRACE_TRACEME %s", + strerror(errno)); + + if (raise(SIGSTOP)) + ksft_exit_fail_msg("raise(SIGSTOP) %s", + strerror(errno)); + + return; + } + + ksft_print_msg("Child: %d\n", child); + + /* Attach to the child */ + while (1) { + int sig; + + pid = wait(&status); + if (pid == -1) { + ksft_print_msg("wait() failed: %s", + strerror(errno)); + goto error; + } + + /* + * This should never happen but it's hard to flag in + * the framework. + */ + if (pid != child) + continue; + + if (WIFEXITED(status) || WIFSIGNALED(status)) + ksft_exit_fail_msg("Child died unexpectedly\n"); + + if (!WIFSTOPPED(status)) + goto error; + + sig = WSTOPSIG(status); + + if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &si)) { + if (errno == ESRCH) { + ASSERT_NE(errno, ESRCH); + return; + } + + if (errno == EINVAL) { + sig = 0; /* bust group-stop */ + goto cont; + } + + ksft_print_msg("PTRACE_GETSIGINFO: %s\n", + strerror(errno)); + goto error; + } + + if (sig == SIGSTOP && si.si_code == SI_TKILL && + si.si_pid == pid) + break; + + cont: + if (ptrace(PTRACE_CONT, pid, NULL, sig)) { + if (errno == ESRCH) { + ASSERT_NE(errno, ESRCH); + return; + } + + ksft_print_msg("PTRACE_CONT: %s\n", strerror(errno)); + goto error; + } + } + + /* Where is the child GCS? 
*/ + iov.iov_base = &child_gcs; + iov.iov_len = sizeof(child_gcs); + ret = ptrace(PTRACE_GETREGSET, child, NT_ARM_GCS, &iov); + if (ret != 0) { + ksft_print_msg("Failed to read child GCS state: %s (%d)\n", + strerror(errno), errno); + goto error; + } + + /* We should have inherited GCS over fork(), confirm */ + if (!(child_gcs.features_enabled & PR_SHADOW_STACK_ENABLE)) { + ASSERT_TRUE(child_gcs.features_enabled & + PR_SHADOW_STACK_ENABLE); + goto error; + } + + gcspr = child_gcs.gcspr_el0; + ksft_print_msg("Child GCSPR 0x%lx, flags %llx, locked %llx\n", + gcspr, child_gcs.features_enabled, + child_gcs.features_locked); + + /* Ideally we'd cross check with the child memory map */ + + errno = 0; + val = ptrace(PTRACE_PEEKDATA, child, (void *)gcspr, NULL); + ret = errno; + if (ret != 0) + ksft_print_msg("PTRACE_PEEKDATA failed: %s (%d)\n", + strerror(ret), ret); + EXPECT_EQ(ret, 0); + + /* The child should be in a function, the GCSPR shouldn't be 0 */ + EXPECT_NE(val, 0); + + /* Same thing via process_vm_readv() */ + local_iov.iov_base = &rval; + local_iov.iov_len = sizeof(rval); + remote_iov.iov_base = (void *)gcspr; + remote_iov.iov_len = sizeof(rval); + ret = process_vm_readv(child, &local_iov, 1, &remote_iov, 1, 0); + if (ret == -1) + ksft_print_msg("process_vm_readv() failed: %s (%d)\n", + strerror(errno), errno); + EXPECT_EQ(ret, sizeof(rval)); + EXPECT_EQ(val, rval); + + /* Write data via a peek */ + ret = ptrace(PTRACE_POKEDATA, child, (void *)gcspr, NULL); + if (ret == -1) + ksft_print_msg("PTRACE_POKEDATA failed: %s (%d)\n", + strerror(errno), errno); + EXPECT_EQ(ret, 0); + EXPECT_EQ(0, ptrace(PTRACE_PEEKDATA, child, (void *)gcspr, NULL)); + + /* Restore what we had before */ + ret = ptrace(PTRACE_POKEDATA, child, (void *)gcspr, val); + if (ret == -1) + ksft_print_msg("PTRACE_POKEDATA failed: %s (%d)\n", + strerror(errno), errno); + EXPECT_EQ(ret, 0); + EXPECT_EQ(val, ptrace(PTRACE_PEEKDATA, child, (void *)gcspr, NULL)); + + /* That's all, folks */ + kill(child, SIGKILL); + return; + +error: + kill(child, SIGKILL); + ASSERT_FALSE(true); +} + +FIXTURE(map_gcs) +{ + unsigned long *stack; +}; + +FIXTURE_VARIANT(map_gcs) +{ + size_t stack_size; + unsigned long flags; +}; + +FIXTURE_VARIANT_ADD(map_gcs, s2k_cap_marker) +{ + .stack_size = 2 * 1024, + .flags = SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s2k_cap) +{ + .stack_size = 2 * 1024, + .flags = SHADOW_STACK_SET_TOKEN, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s2k_marker) +{ + .stack_size = 2 * 1024, + .flags = SHADOW_STACK_SET_MARKER, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s2k) +{ + .stack_size = 2 * 1024, + .flags = 0, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s4k_cap_marker) +{ + .stack_size = 4 * 1024, + .flags = SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s4k_cap) +{ + .stack_size = 4 * 1024, + .flags = SHADOW_STACK_SET_TOKEN, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s3k_marker) +{ + .stack_size = 4 * 1024, + .flags = SHADOW_STACK_SET_MARKER, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s4k) +{ + .stack_size = 4 * 1024, + .flags = 0, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s16k_cap_marker) +{ + .stack_size = 16 * 1024, + .flags = SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s16k_cap) +{ + .stack_size = 16 * 1024, + .flags = SHADOW_STACK_SET_TOKEN, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s16k_marker) +{ + .stack_size = 16 * 1024, + .flags = SHADOW_STACK_SET_MARKER, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s16k) +{ + .stack_size = 16 * 1024, + 
.flags = 0, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s64k_cap_marker) +{ + .stack_size = 64 * 1024, + .flags = SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s64k_cap) +{ + .stack_size = 64 * 1024, + .flags = SHADOW_STACK_SET_TOKEN, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s64k_marker) +{ + .stack_size = 64 * 1024, + .flags = SHADOW_STACK_SET_MARKER, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s64k) +{ + .stack_size = 64 * 1024, + .flags = 0, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s128k_cap_marker) +{ + .stack_size = 128 * 1024, + .flags = SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s128k_cap) +{ + .stack_size = 128 * 1024, + .flags = SHADOW_STACK_SET_TOKEN, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s128k_marker) +{ + .stack_size = 128 * 1024, + .flags = SHADOW_STACK_SET_MARKER, +}; + +FIXTURE_VARIANT_ADD(map_gcs, s128k) +{ + .stack_size = 128 * 1024, + .flags = 0, +}; + +FIXTURE_SETUP(map_gcs) +{ + self->stack = (void *)syscall(__NR_map_shadow_stack, 0, + variant->stack_size, + variant->flags); + ASSERT_FALSE(self->stack == MAP_FAILED); + ksft_print_msg("Allocated stack from %p-%p\n", self->stack, + self->stack + variant->stack_size); +} + +FIXTURE_TEARDOWN(map_gcs) +{ + int ret; + + if (self->stack != MAP_FAILED) { + ret = munmap(self->stack, variant->stack_size); + ASSERT_EQ(ret, 0); + } +} + +/* The stack has a cap token */ +TEST_F(map_gcs, stack_capped) +{ + unsigned long *stack = self->stack; + size_t cap_index; + + cap_index = (variant->stack_size / sizeof(unsigned long)); + + switch (variant->flags & (SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN)) { + case SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN: + cap_index -= 2; + break; + case SHADOW_STACK_SET_TOKEN: + cap_index -= 1; + break; + case SHADOW_STACK_SET_MARKER: + case 0: + /* No cap, no test */ + return; + } + + ASSERT_EQ(stack[cap_index], GCS_CAP(&stack[cap_index])); +} + +/* The top of the stack is 0 */ +TEST_F(map_gcs, stack_terminated) +{ + unsigned long *stack = self->stack; + size_t term_index; + + if (!(variant->flags & SHADOW_STACK_SET_MARKER)) + return; + + term_index = (variant->stack_size / sizeof(unsigned long)) - 1; + + ASSERT_EQ(stack[term_index], 0); +} + +/* Writes should fault */ +TEST_F_SIGNAL(map_gcs, not_writeable, SIGSEGV) +{ + self->stack[0] = 0; +} + +/* Put it all together, we can safely switch to and from the stack */ +TEST_F(map_gcs, stack_switch) +{ + size_t cap_index; + cap_index = (variant->stack_size / sizeof(unsigned long)); + unsigned long *orig_gcspr_el0, *pivot_gcspr_el0; + + /* Skip over the stack terminator and point at the cap */ + switch (variant->flags & (SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN)) { + case SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN: + cap_index -= 2; + break; + case SHADOW_STACK_SET_TOKEN: + cap_index -= 1; + break; + case SHADOW_STACK_SET_MARKER: + case 0: + /* No cap, no test */ + return; + } + pivot_gcspr_el0 = &self->stack[cap_index]; + + /* Pivot to the new GCS */ + ksft_print_msg("Pivoting to %p from %p, target has value 0x%lx\n", + pivot_gcspr_el0, get_gcspr(), + *pivot_gcspr_el0); + gcsss1(pivot_gcspr_el0); + orig_gcspr_el0 = gcsss2(); + ksft_print_msg("Pivoted to %p from %p, target has value 0x%lx\n", + get_gcspr(), orig_gcspr_el0, + *pivot_gcspr_el0); + + ksft_print_msg("Pivoted, GCSPR_EL0 now %p\n", get_gcspr()); + + /* New GCS must be in the new buffer */ + ASSERT_TRUE((unsigned long)get_gcspr() > (unsigned long)self->stack); + ASSERT_TRUE((unsigned long)get_gcspr() <= + (unsigned 
long)self->stack + variant->stack_size); + + /* We should be able to use all but 2 slots of the new stack */ + ksft_print_msg("Recursing %zu levels\n", cap_index - 1); + gcs_recurse(cap_index - 1); + + /* Pivot back to the original GCS */ + gcsss1(orig_gcspr_el0); + pivot_gcspr_el0 = gcsss2(); + + gcs_recurse(0); + ksft_print_msg("Pivoted back to GCSPR_EL0 0x%p\n", get_gcspr()); +} + +/* We fault if we try to go beyond the end of the stack */ +TEST_F_SIGNAL(map_gcs, stack_overflow, SIGSEGV) +{ + size_t cap_index; + cap_index = (variant->stack_size / sizeof(unsigned long)); + unsigned long *orig_gcspr_el0, *pivot_gcspr_el0; + + /* Skip over the stack terminator and point at the cap */ + switch (variant->flags & (SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN)) { + case SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN: + cap_index -= 2; + break; + case SHADOW_STACK_SET_TOKEN: + cap_index -= 1; + break; + case SHADOW_STACK_SET_MARKER: + case 0: + /* No cap, no test but we need to SEGV to avoid a false fail */ + orig_gcspr_el0 = get_gcspr(); + *orig_gcspr_el0 = 0; + return; + } + pivot_gcspr_el0 = &self->stack[cap_index]; + + /* Pivot to the new GCS */ + ksft_print_msg("Pivoting to %p from %p, target has value 0x%lx\n", + pivot_gcspr_el0, get_gcspr(), + *pivot_gcspr_el0); + gcsss1(pivot_gcspr_el0); + orig_gcspr_el0 = gcsss2(); + ksft_print_msg("Pivoted to %p from %p, target has value 0x%lx\n", + pivot_gcspr_el0, orig_gcspr_el0, + *pivot_gcspr_el0); + + ksft_print_msg("Pivoted, GCSPR_EL0 now %p\n", get_gcspr()); + + /* New GCS must be in the new buffer */ + ASSERT_TRUE((unsigned long)get_gcspr() > (unsigned long)self->stack); + ASSERT_TRUE((unsigned long)get_gcspr() <= + (unsigned long)self->stack + variant->stack_size); + + /* Now try to recurse, we should fault doing this. */ + ksft_print_msg("Recursing %zu levels...\n", cap_index + 1); + gcs_recurse(cap_index + 1); + ksft_print_msg("...done\n"); + + /* Clean up properly to try to guard against spurious passes. 
*/ + gcsss1(orig_gcspr_el0); + pivot_gcspr_el0 = gcsss2(); + ksft_print_msg("Pivoted back to GCSPR_EL0 0x%p\n", get_gcspr()); +} + +FIXTURE(map_invalid_gcs) +{ +}; + +FIXTURE_VARIANT(map_invalid_gcs) +{ + size_t stack_size; +}; + +FIXTURE_SETUP(map_invalid_gcs) +{ +} + +FIXTURE_TEARDOWN(map_invalid_gcs) +{ +} + +/* GCS must be larger than 16 bytes */ +FIXTURE_VARIANT_ADD(map_invalid_gcs, too_small) +{ + .stack_size = 8, +}; + +/* GCS size must be 16 byte aligned */ +FIXTURE_VARIANT_ADD(map_invalid_gcs, unligned_1) { .stack_size = 1024 + 1 }; +FIXTURE_VARIANT_ADD(map_invalid_gcs, unligned_2) { .stack_size = 1024 + 2 }; +FIXTURE_VARIANT_ADD(map_invalid_gcs, unligned_3) { .stack_size = 1024 + 3 }; +FIXTURE_VARIANT_ADD(map_invalid_gcs, unligned_4) { .stack_size = 1024 + 4 }; +FIXTURE_VARIANT_ADD(map_invalid_gcs, unligned_5) { .stack_size = 1024 + 5 }; +FIXTURE_VARIANT_ADD(map_invalid_gcs, unligned_6) { .stack_size = 1024 + 6 }; +FIXTURE_VARIANT_ADD(map_invalid_gcs, unligned_7) { .stack_size = 1024 + 7 }; + +TEST_F(map_invalid_gcs, do_map) +{ + void *stack; + + stack = (void *)syscall(__NR_map_shadow_stack, 0, + variant->stack_size, 0); + ASSERT_TRUE(stack == MAP_FAILED); + if (stack != MAP_FAILED) + munmap(stack, variant->stack_size); +} + +FIXTURE(invalid_mprotect) +{ + unsigned long *stack; + size_t stack_size; +}; + +FIXTURE_VARIANT(invalid_mprotect) +{ + unsigned long flags; +}; + +FIXTURE_SETUP(invalid_mprotect) +{ + self->stack_size = sysconf(_SC_PAGE_SIZE); + self->stack = (void *)syscall(__NR_map_shadow_stack, 0, + self->stack_size, 0); + ASSERT_FALSE(self->stack == MAP_FAILED); + ksft_print_msg("Allocated stack from %p-%p\n", self->stack, + self->stack + self->stack_size); +} + +FIXTURE_TEARDOWN(invalid_mprotect) +{ + int ret; + + if (self->stack != MAP_FAILED) { + ret = munmap(self->stack, self->stack_size); + ASSERT_EQ(ret, 0); + } +} + +FIXTURE_VARIANT_ADD(invalid_mprotect, exec) +{ + .flags = PROT_EXEC, +}; + +TEST_F(invalid_mprotect, do_map) +{ + int ret; + + ret = mprotect(self->stack, self->stack_size, variant->flags); + ASSERT_EQ(ret, -1); +} + +TEST_F(invalid_mprotect, do_map_read) +{ + int ret; + + ret = mprotect(self->stack, self->stack_size, + variant->flags | PROT_READ); + ASSERT_EQ(ret, -1); +} + +int main(int argc, char **argv) +{ + unsigned long gcs_mode; + int ret; + + if (!(getauxval(AT_HWCAP) & HWCAP_GCS)) + ksft_exit_skip("SKIP GCS not supported\n"); + + /* + * Force shadow stacks on, our tests *should* be fine with or + * without libc support and with or without this having ended + * up tagged for GCS and enabled by the dynamic linker. We + * can't use the libc prctl() function since we can't return + * from enabling the stack. 
+ */ + ret = my_syscall2(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &gcs_mode); + if (ret) { + ksft_print_msg("Failed to read GCS state: %d\n", ret); + return EXIT_FAILURE; + } + + if (!(gcs_mode & PR_SHADOW_STACK_ENABLE)) { + gcs_mode = PR_SHADOW_STACK_ENABLE; + ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, + gcs_mode); + if (ret) { + ksft_print_msg("Failed to configure GCS: %d\n", ret); + return EXIT_FAILURE; + } + } + + /* Avoid returning in case libc doesn't understand GCS */ + exit(test_harness_run(argc, argv)); +} From 58d69a3e35825698b5daddc1a074e9ea19cb0c51 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:15 +0100 Subject: [PATCH 035/168] kselftest/arm64: Add test coverage for GCS mode locking Verify that we can lock individual GCS mode bits, that other modes aren't affected and as a side effect also that every combination of modes can be enabled. Normally the inability to reenable GCS after disabling it would be an issue with testing but fortunately the kselftest_harness runs each test within a fork()ed child. This can be inconvenient for some kinds of testing but here it means that each test is in a separate thread and therefore won't be affected by other tests in the suite. Once we get toolchains with support for enabling GCS by default we will need to take care to not do that in the build system but there are no such toolchains yet so it is not yet an issue. Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-36-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/gcs/.gitignore | 1 + tools/testing/selftests/arm64/gcs/Makefile | 2 +- .../testing/selftests/arm64/gcs/gcs-locking.c | 200 ++++++++++++++++++ 3 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/arm64/gcs/gcs-locking.c diff --git a/tools/testing/selftests/arm64/gcs/.gitignore b/tools/testing/selftests/arm64/gcs/.gitignore index 5810c4a163d4..0c86f53f68ad 100644 --- a/tools/testing/selftests/arm64/gcs/.gitignore +++ b/tools/testing/selftests/arm64/gcs/.gitignore @@ -1,2 +1,3 @@ basic-gcs libc-gcs +gcs-locking diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile index a8fdf21e9a47..2173d6275956 100644 --- a/tools/testing/selftests/arm64/gcs/Makefile +++ b/tools/testing/selftests/arm64/gcs/Makefile @@ -6,7 +6,7 @@ # nolibc. # -TEST_GEN_PROGS := basic-gcs libc-gcs +TEST_GEN_PROGS := basic-gcs libc-gcs gcs-locking LDLIBS+=-lpthread diff --git a/tools/testing/selftests/arm64/gcs/gcs-locking.c b/tools/testing/selftests/arm64/gcs/gcs-locking.c new file mode 100644 index 000000000000..989f75a491b7 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/gcs-locking.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Limited. + * + * Tests for GCS mode locking. These tests rely on both having GCS + * unconfigured on entry and on the kselftest harness running each + * test in a fork()ed process which will have it's own mode. 
+ */ + +#include + +#include +#include + +#include + +#include "kselftest_harness.h" + +#include "gcs-util.h" + +#define my_syscall2(num, arg1, arg2) \ +({ \ + register long _num __asm__ ("x8") = (num); \ + register long _arg1 __asm__ ("x0") = (long)(arg1); \ + register long _arg2 __asm__ ("x1") = (long)(arg2); \ + register long _arg3 __asm__ ("x2") = 0; \ + register long _arg4 __asm__ ("x3") = 0; \ + register long _arg5 __asm__ ("x4") = 0; \ + \ + __asm__ volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), \ + "r"(_arg3), "r"(_arg4), \ + "r"(_arg5), "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +/* No mode bits are rejected for locking */ +TEST(lock_all_modes) +{ + int ret; + + ret = prctl(PR_LOCK_SHADOW_STACK_STATUS, ULONG_MAX, 0, 0, 0); + ASSERT_EQ(ret, 0); +} + +FIXTURE(valid_modes) +{ +}; + +FIXTURE_VARIANT(valid_modes) +{ + unsigned long mode; +}; + +FIXTURE_VARIANT_ADD(valid_modes, enable) +{ + .mode = PR_SHADOW_STACK_ENABLE, +}; + +FIXTURE_VARIANT_ADD(valid_modes, enable_write) +{ + .mode = PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE, +}; + +FIXTURE_VARIANT_ADD(valid_modes, enable_push) +{ + .mode = PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_PUSH, +}; + +FIXTURE_VARIANT_ADD(valid_modes, enable_write_push) +{ + .mode = PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE | + PR_SHADOW_STACK_PUSH, +}; + +FIXTURE_SETUP(valid_modes) +{ +} + +FIXTURE_TEARDOWN(valid_modes) +{ +} + +/* We can set the mode at all */ +TEST_F(valid_modes, set) +{ + int ret; + + ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, + variant->mode); + ASSERT_EQ(ret, 0); + + _exit(0); +} + +/* Enabling, locking then disabling is rejected */ +TEST_F(valid_modes, enable_lock_disable) +{ + unsigned long mode; + int ret; + + ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, + variant->mode); + ASSERT_EQ(ret, 0); + + ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0); + ASSERT_EQ(ret, 0); + ASSERT_EQ(mode, variant->mode); + + ret = prctl(PR_LOCK_SHADOW_STACK_STATUS, variant->mode, 0, 0, 0); + ASSERT_EQ(ret, 0); + + ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0); + ASSERT_EQ(ret, -EBUSY); + + _exit(0); +} + +/* Locking then enabling is rejected */ +TEST_F(valid_modes, lock_enable) +{ + unsigned long mode; + int ret; + + ret = prctl(PR_LOCK_SHADOW_STACK_STATUS, variant->mode, 0, 0, 0); + ASSERT_EQ(ret, 0); + + ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, + variant->mode); + ASSERT_EQ(ret, -EBUSY); + + ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0); + ASSERT_EQ(ret, 0); + ASSERT_EQ(mode, 0); + + _exit(0); +} + +/* Locking then changing other modes is fine */ +TEST_F(valid_modes, lock_enable_disable_others) +{ + unsigned long mode; + int ret; + + ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, + variant->mode); + ASSERT_EQ(ret, 0); + + ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0); + ASSERT_EQ(ret, 0); + ASSERT_EQ(mode, variant->mode); + + ret = prctl(PR_LOCK_SHADOW_STACK_STATUS, variant->mode, 0, 0, 0); + ASSERT_EQ(ret, 0); + + ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, + PR_SHADOW_STACK_ALL_MODES); + ASSERT_EQ(ret, 0); + + ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0); + ASSERT_EQ(ret, 0); + ASSERT_EQ(mode, PR_SHADOW_STACK_ALL_MODES); + + + ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, + variant->mode); + ASSERT_EQ(ret, 0); + + ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0); + ASSERT_EQ(ret, 0); + ASSERT_EQ(mode, variant->mode); + + _exit(0); +} + +int main(int argc, char 
**argv) +{ + unsigned long mode; + int ret; + + if (!(getauxval(AT_HWCAP) & HWCAP_GCS)) + ksft_exit_skip("SKIP GCS not supported\n"); + + ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0); + if (ret) { + ksft_print_msg("Failed to read GCS state: %d\n", ret); + return EXIT_FAILURE; + } + + if (mode & PR_SHADOW_STACK_ENABLE) { + ksft_print_msg("GCS was enabled, test unsupported\n"); + return KSFT_SKIP; + } + + return test_harness_run(argc, argv); +} From 794b64ca5665323f36e5fc92dfca02a3797b6523 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:16 +0100 Subject: [PATCH 036/168] kselftest/arm64: Add GCS signal tests Do some testing of the signal handling for GCS, checking that a GCS frame has the expected information in it and that the expected signals are delivered with invalid operations. Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-37-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- .../testing/selftests/arm64/signal/.gitignore | 1 + .../arm64/signal/test_signals_utils.h | 10 +++ .../signal/testcases/gcs_exception_fault.c | 62 +++++++++++++ .../arm64/signal/testcases/gcs_frame.c | 88 +++++++++++++++++++ .../arm64/signal/testcases/gcs_write_fault.c | 67 ++++++++++++++ 5 files changed, 228 insertions(+) create mode 100644 tools/testing/selftests/arm64/signal/testcases/gcs_exception_fault.c create mode 100644 tools/testing/selftests/arm64/signal/testcases/gcs_frame.c create mode 100644 tools/testing/selftests/arm64/signal/testcases/gcs_write_fault.c diff --git a/tools/testing/selftests/arm64/signal/.gitignore b/tools/testing/selftests/arm64/signal/.gitignore index b2f2bfd5c6aa..b257db665a35 100644 --- a/tools/testing/selftests/arm64/signal/.gitignore +++ b/tools/testing/selftests/arm64/signal/.gitignore @@ -3,6 +3,7 @@ mangle_* fake_sigreturn_* fpmr_* poe_* +gcs_* sme_* ssve_* sve_* diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.h b/tools/testing/selftests/arm64/signal/test_signals_utils.h index 1e80808ee105..36fc12b3cd60 100644 --- a/tools/testing/selftests/arm64/signal/test_signals_utils.h +++ b/tools/testing/selftests/arm64/signal/test_signals_utils.h @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -47,6 +48,15 @@ void test_result(struct tdescr *td); _arg1; \ }) +static inline __attribute__((always_inline)) uint64_t get_gcspr_el0(void) +{ + uint64_t val; + + asm volatile("mrs %0, S3_3_C2_C5_1" : "=r" (val)); + + return val; +} + static inline bool feats_ok(struct tdescr *td) { if (td->feats_incompatible & td->feats_supported) diff --git a/tools/testing/selftests/arm64/signal/testcases/gcs_exception_fault.c b/tools/testing/selftests/arm64/signal/testcases/gcs_exception_fault.c new file mode 100644 index 000000000000..6228448b2ae7 --- /dev/null +++ b/tools/testing/selftests/arm64/signal/testcases/gcs_exception_fault.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 ARM Limited + */ + +#include +#include +#include + +#include +#include + +#include "test_signals_utils.h" +#include "testcases.h" + +/* + * We should get this from asm/siginfo.h but the testsuite is being + * clever with redefining siginfo_t. 
+ */ +#ifndef SEGV_CPERR +#define SEGV_CPERR 10 +#endif + +static inline void gcsss1(uint64_t Xt) +{ + asm volatile ( + "sys #3, C7, C7, #2, %0\n" + : + : "rZ" (Xt) + : "memory"); +} + +static int gcs_op_fault_trigger(struct tdescr *td) +{ + /* + * The slot below our current GCS should be in a valid GCS but + * must not have a valid cap in it. + */ + gcsss1(get_gcspr_el0() - 8); + + return 0; +} + +static int gcs_op_fault_signal(struct tdescr *td, siginfo_t *si, + ucontext_t *uc) +{ + ASSERT_GOOD_CONTEXT(uc); + + return 1; +} + +struct tdescr tde = { + .name = "Invalid GCS operation", + .descr = "An invalid GCS operation generates the expected signal", + .feats_required = FEAT_GCS, + .timeout = 3, + .sig_ok = SIGSEGV, + .sig_ok_code = SEGV_CPERR, + .sanity_disabled = true, + .trigger = gcs_op_fault_trigger, + .run = gcs_op_fault_signal, +}; diff --git a/tools/testing/selftests/arm64/signal/testcases/gcs_frame.c b/tools/testing/selftests/arm64/signal/testcases/gcs_frame.c new file mode 100644 index 000000000000..b405d82321da --- /dev/null +++ b/tools/testing/selftests/arm64/signal/testcases/gcs_frame.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 ARM Limited + */ + +#include +#include +#include + +#include "test_signals_utils.h" +#include "testcases.h" + +static union { + ucontext_t uc; + char buf[1024 * 64]; +} context; + +static int gcs_regs(struct tdescr *td, siginfo_t *si, ucontext_t *uc) +{ + size_t offset; + struct _aarch64_ctx *head = GET_BUF_RESV_HEAD(context); + struct gcs_context *gcs; + unsigned long expected, gcspr; + uint64_t *u64_val; + int ret; + + ret = prctl(PR_GET_SHADOW_STACK_STATUS, &expected, 0, 0, 0); + if (ret != 0) { + fprintf(stderr, "Unable to query GCS status\n"); + return 1; + } + + /* We expect a cap to be added to the GCS in the signal frame */ + gcspr = get_gcspr_el0(); + gcspr -= 8; + fprintf(stderr, "Expecting GCSPR_EL0 %lx\n", gcspr); + + if (!get_current_context(td, &context.uc, sizeof(context))) { + fprintf(stderr, "Failed getting context\n"); + return 1; + } + + /* Ensure that the signal restore token was consumed */ + u64_val = (uint64_t *)get_gcspr_el0() + 1; + if (*u64_val) { + fprintf(stderr, "GCS value at %p is %lx not 0\n", + u64_val, *u64_val); + return 1; + } + + fprintf(stderr, "Got context\n"); + + head = get_header(head, GCS_MAGIC, GET_BUF_RESV_SIZE(context), + &offset); + if (!head) { + fprintf(stderr, "No GCS context\n"); + return 1; + } + + gcs = (struct gcs_context *)head; + + /* Basic size validation is done in get_current_context() */ + + if (gcs->features_enabled != expected) { + fprintf(stderr, "Features enabled %llx but expected %lx\n", + gcs->features_enabled, expected); + return 1; + } + + if (gcs->gcspr != gcspr) { + fprintf(stderr, "Got GCSPR %llx but expected %lx\n", + gcs->gcspr, gcspr); + return 1; + } + + fprintf(stderr, "GCS context validated\n"); + td->pass = 1; + + return 0; +} + +struct tdescr tde = { + .name = "GCS basics", + .descr = "Validate a GCS signal context", + .feats_required = FEAT_GCS, + .timeout = 3, + .run = gcs_regs, +}; diff --git a/tools/testing/selftests/arm64/signal/testcases/gcs_write_fault.c b/tools/testing/selftests/arm64/signal/testcases/gcs_write_fault.c new file mode 100644 index 000000000000..faeabb18c4b2 --- /dev/null +++ b/tools/testing/selftests/arm64/signal/testcases/gcs_write_fault.c @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 ARM Limited + */ + +#include +#include +#include + +#include +#include + +#include 
"test_signals_utils.h" +#include "testcases.h" + +static uint64_t *gcs_page; + +#ifndef __NR_map_shadow_stack +#define __NR_map_shadow_stack 453 +#endif + +static bool alloc_gcs(struct tdescr *td) +{ + long page_size = sysconf(_SC_PAGE_SIZE); + + gcs_page = (void *)syscall(__NR_map_shadow_stack, 0, + page_size, 0); + if (gcs_page == MAP_FAILED) { + fprintf(stderr, "Failed to map %ld byte GCS: %d\n", + page_size, errno); + return false; + } + + return true; +} + +static int gcs_write_fault_trigger(struct tdescr *td) +{ + /* Verify that the page is readable (ie, not completely unmapped) */ + fprintf(stderr, "Read value 0x%lx\n", gcs_page[0]); + + /* A regular write should trigger a fault */ + gcs_page[0] = EINVAL; + + return 0; +} + +static int gcs_write_fault_signal(struct tdescr *td, siginfo_t *si, + ucontext_t *uc) +{ + ASSERT_GOOD_CONTEXT(uc); + + return 1; +} + + +struct tdescr tde = { + .name = "GCS write fault", + .descr = "Normal writes to a GCS segfault", + .feats_required = FEAT_GCS, + .timeout = 3, + .sig_ok = SIGSEGV, + .sanity_disabled = true, + .init = alloc_gcs, + .trigger = gcs_write_fault_trigger, + .run = gcs_write_fault_signal, +}; From 05e6cfff58c481bfe0ada24ebe1c205e2817dacd Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:17 +0100 Subject: [PATCH 037/168] kselftest/arm64: Add a GCS stress test Add a stress test which runs one more process than we have CPUs spinning through a very recursive function with frequent syscalls immediately prior to return and signals being injected every 100ms. The goal is to flag up any scheduling related issues, for example failure to ensure that barriers are inserted when moving a GCS using task to another CPU. The test runs for a configurable amount of time, defaulting to 10 seconds. Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-38-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/gcs/.gitignore | 2 + tools/testing/selftests/arm64/gcs/Makefile | 6 +- .../testing/selftests/arm64/gcs/asm-offsets.h | 0 .../selftests/arm64/gcs/gcs-stress-thread.S | 311 ++++++++++ .../testing/selftests/arm64/gcs/gcs-stress.c | 530 ++++++++++++++++++ 5 files changed, 848 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/arm64/gcs/asm-offsets.h create mode 100644 tools/testing/selftests/arm64/gcs/gcs-stress-thread.S create mode 100644 tools/testing/selftests/arm64/gcs/gcs-stress.c diff --git a/tools/testing/selftests/arm64/gcs/.gitignore b/tools/testing/selftests/arm64/gcs/.gitignore index 0c86f53f68ad..1e8d1f6b27f2 100644 --- a/tools/testing/selftests/arm64/gcs/.gitignore +++ b/tools/testing/selftests/arm64/gcs/.gitignore @@ -1,3 +1,5 @@ basic-gcs libc-gcs gcs-locking +gcs-stress +gcs-stress-thread diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile index 2173d6275956..d8b06ca51e22 100644 --- a/tools/testing/selftests/arm64/gcs/Makefile +++ b/tools/testing/selftests/arm64/gcs/Makefile @@ -6,7 +6,8 @@ # nolibc. # -TEST_GEN_PROGS := basic-gcs libc-gcs gcs-locking +TEST_GEN_PROGS := basic-gcs libc-gcs gcs-locking gcs-stress +TEST_GEN_PROGS_EXTENDED := gcs-stress-thread LDLIBS+=-lpthread @@ -18,3 +19,6 @@ $(OUTPUT)/basic-gcs: basic-gcs.c -I../../../../../usr/include \ -std=gnu99 -I../.. 
-g \ -ffreestanding -Wall $^ -o $@ -lgcc + +$(OUTPUT)/gcs-stress-thread: gcs-stress-thread.S + $(CC) -nostdlib $^ -o $@ diff --git a/tools/testing/selftests/arm64/gcs/asm-offsets.h b/tools/testing/selftests/arm64/gcs/asm-offsets.h new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress-thread.S b/tools/testing/selftests/arm64/gcs/gcs-stress-thread.S new file mode 100644 index 000000000000..b88b25217da5 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/gcs-stress-thread.S @@ -0,0 +1,311 @@ +// Program that loops for ever doing lots of recursions and system calls, +// intended to be used as part of a stress test for GCS context switching. +// +// Copyright 2015-2023 Arm Ltd + +#include + +#define sa_sz 32 +#define sa_flags 8 +#define sa_handler 0 +#define sa_mask_sz 8 + +#define si_code 8 + +#define SIGINT 2 +#define SIGABRT 6 +#define SIGUSR1 10 +#define SIGSEGV 11 +#define SIGUSR2 12 +#define SIGTERM 15 +#define SEGV_CPERR 10 + +#define SA_NODEFER 1073741824 +#define SA_SIGINFO 4 +#define ucontext_regs 184 + +#define PR_SET_SHADOW_STACK_STATUS 75 +# define PR_SHADOW_STACK_ENABLE (1UL << 0) + +#define GCSPR_EL0 S3_3_C2_C5_1 + +.macro function name + .macro endfunction + .type \name, @function + .purgem endfunction + .endm +\name: +.endm + +// Print a single character x0 to stdout +// Clobbers x0-x2,x8 +function putc + str x0, [sp, #-16]! + + mov x0, #1 // STDOUT_FILENO + mov x1, sp + mov x2, #1 + mov x8, #__NR_write + svc #0 + + add sp, sp, #16 + ret +endfunction +.globl putc + +// Print a NUL-terminated string starting at address x0 to stdout +// Clobbers x0-x3,x8 +function puts + mov x1, x0 + + mov x2, #0 +0: ldrb w3, [x0], #1 + cbz w3, 1f + add x2, x2, #1 + b 0b + +1: mov w0, #1 // STDOUT_FILENO + mov x8, #__NR_write + svc #0 + + ret +endfunction +.globl puts + +// Utility macro to print a literal string +// Clobbers x0-x4,x8 +.macro puts string + .pushsection .rodata.str1.1, "aMS", @progbits, 1 +.L__puts_literal\@: .string "\string" + .popsection + + ldr x0, =.L__puts_literal\@ + bl puts +.endm + +// Print an unsigned decimal number x0 to stdout +// Clobbers x0-x4,x8 +function putdec + mov x1, sp + str x30, [sp, #-32]! // Result can't be > 20 digits + + mov x2, #0 + strb w2, [x1, #-1]! // Write the NUL terminator + + mov x2, #10 +0: udiv x3, x0, x2 // div-mod loop to generate the digits + msub x0, x3, x2, x0 + add w0, w0, #'0' + strb w0, [x1, #-1]! + mov x0, x3 + cbnz x3, 0b + + ldrb w0, [x1] + cbnz w0, 1f + mov w0, #'0' // Print "0" for 0, not "" + strb w0, [x1, #-1]! + +1: mov x0, x1 + bl puts + + ldr x30, [sp], #32 + ret +endfunction +.globl putdec + +// Print an unsigned decimal number x0 to stdout, followed by a newline +// Clobbers x0-x5,x8 +function putdecn + mov x5, x30 + + bl putdec + mov x0, #'\n' + bl putc + + ret x5 +endfunction +.globl putdecn + +// Fill x1 bytes starting at x0 with 0. +// Clobbers x1, x2. +function memclr + mov w2, #0 +endfunction +.globl memclr + // fall through to memfill + +// Trivial memory fill: fill x1 bytes starting at address x0 with byte w2 +// Clobbers x1 +function memfill + cmp x1, #0 + b.eq 1f + +0: strb w2, [x0], #1 + subs x1, x1, #1 + b.ne 0b + +1: ret +endfunction +.globl memfill + +// w0: signal number +// x1: sa_action +// w2: sa_flags +// Clobbers x0-x6,x8 +function setsignal + str x30, [sp, #-((sa_sz + 15) / 16 * 16 + 16)]! 
+ + mov w4, w0 + mov x5, x1 + mov w6, w2 + + add x0, sp, #16 + mov x1, #sa_sz + bl memclr + + mov w0, w4 + add x1, sp, #16 + str w6, [x1, #sa_flags] + str x5, [x1, #sa_handler] + mov x2, #0 + mov x3, #sa_mask_sz + mov x8, #__NR_rt_sigaction + svc #0 + + cbz w0, 1f + + puts "sigaction failure\n" + b abort + +1: ldr x30, [sp], #((sa_sz + 15) / 16 * 16 + 16) + ret +endfunction + + +function tickle_handler + // Perhaps collect GCSPR_EL0 here in future? + ret +endfunction + +function terminate_handler + mov w21, w0 + mov x20, x2 + + puts "Terminated by signal " + mov w0, w21 + bl putdec + puts ", no error\n" + + mov x0, #0 + mov x8, #__NR_exit + svc #0 +endfunction + +function segv_handler + // stash the siginfo_t * + mov x20, x1 + + // Disable GCS, we don't want additional faults logging things + mov x0, PR_SET_SHADOW_STACK_STATUS + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x8, #__NR_prctl + svc #0 + + puts "Got SIGSEGV code " + + ldr x21, [x20, #si_code] + mov x0, x21 + bl putdec + + // GCS faults should have si_code SEGV_CPERR + cmp x21, #SEGV_CPERR + bne 1f + + puts " (GCS violation)" +1: + mov x0, '\n' + bl putc + b abort +endfunction + +// Recurse x20 times +.macro recurse id +function recurse\id + stp x29, x30, [sp, #-16]! + mov x29, sp + + cmp x20, 0 + beq 1f + sub x20, x20, 1 + bl recurse\id + +1: + ldp x29, x30, [sp], #16 + + // Do a syscall immediately prior to returning to try to provoke + // scheduling and migration at a point where coherency issues + // might trigger. + mov x8, #__NR_getpid + svc #0 + + ret +endfunction +.endm + +// Generate and use two copies so we're changing the GCS contents +recurse 1 +recurse 2 + +.globl _start +function _start + // Run with GCS + mov x0, PR_SET_SHADOW_STACK_STATUS + mov x1, PR_SHADOW_STACK_ENABLE + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x8, #__NR_prctl + svc #0 + cbz x0, 1f + puts "Failed to enable GCS\n" + b abort +1: + + mov w0, #SIGTERM + adr x1, terminate_handler + mov w2, #SA_SIGINFO + bl setsignal + + mov w0, #SIGUSR1 + adr x1, tickle_handler + mov w2, #SA_SIGINFO + orr w2, w2, #SA_NODEFER + bl setsignal + + mov w0, #SIGSEGV + adr x1, segv_handler + mov w2, #SA_SIGINFO + orr w2, w2, #SA_NODEFER + bl setsignal + + puts "Running\n" + +loop: + // Small recursion depth so we're frequently flipping between + // the two recursors and changing what's on the stack + mov x20, #5 + bl recurse1 + mov x20, #5 + bl recurse2 + b loop +endfunction + +abort: + mov x0, #255 + mov x8, #__NR_exit + svc #0 diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress.c b/tools/testing/selftests/arm64/gcs/gcs-stress.c new file mode 100644 index 000000000000..bdec7ee8cfd5 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/gcs-stress.c @@ -0,0 +1,530 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2022-3 ARM Limited. 
+ */ + +#define _GNU_SOURCE +#define _POSIX_C_SOURCE 199309L + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../kselftest.h" + +struct child_data { + char *name, *output; + pid_t pid; + int stdout; + bool output_seen; + bool exited; + int exit_status; + int exit_signal; +}; + +static int epoll_fd; +static struct child_data *children; +static struct epoll_event *evs; +static int tests; +static int num_children; +static bool terminate; + +static int startup_pipe[2]; + +static int num_processors(void) +{ + long nproc = sysconf(_SC_NPROCESSORS_CONF); + if (nproc < 0) { + perror("Unable to read number of processors\n"); + exit(EXIT_FAILURE); + } + + return nproc; +} + +static void start_thread(struct child_data *child) +{ + int ret, pipefd[2], i; + struct epoll_event ev; + + ret = pipe(pipefd); + if (ret != 0) + ksft_exit_fail_msg("Failed to create stdout pipe: %s (%d)\n", + strerror(errno), errno); + + child->pid = fork(); + if (child->pid == -1) + ksft_exit_fail_msg("fork() failed: %s (%d)\n", + strerror(errno), errno); + + if (!child->pid) { + /* + * In child, replace stdout with the pipe, errors to + * stderr from here as kselftest prints to stdout. + */ + ret = dup2(pipefd[1], 1); + if (ret == -1) { + fprintf(stderr, "dup2() %d\n", errno); + exit(EXIT_FAILURE); + } + + /* + * Duplicate the read side of the startup pipe to + * FD 3 so we can close everything else. + */ + ret = dup2(startup_pipe[0], 3); + if (ret == -1) { + fprintf(stderr, "dup2() %d\n", errno); + exit(EXIT_FAILURE); + } + + /* + * Very dumb mechanism to clean open FDs other than + * stdio. We don't want O_CLOEXEC for the pipes... + */ + for (i = 4; i < 8192; i++) + close(i); + + /* + * Read from the startup pipe, there should be no data + * and we should block until it is closed. We just + * carry on on error since this isn't super critical. + */ + ret = read(3, &i, sizeof(i)); + if (ret < 0) + fprintf(stderr, "read(startp pipe) failed: %s (%d)\n", + strerror(errno), errno); + if (ret > 0) + fprintf(stderr, "%d bytes of data on startup pipe\n", + ret); + close(3); + + ret = execl("gcs-stress-thread", "gcs-stress-thread", NULL); + fprintf(stderr, "execl(gcs-stress-thread) failed: %d (%s)\n", + errno, strerror(errno)); + + exit(EXIT_FAILURE); + } else { + /* + * In parent, remember the child and close our copy of the + * write side of stdout. 
+ */ + close(pipefd[1]); + child->stdout = pipefd[0]; + child->output = NULL; + child->exited = false; + child->output_seen = false; + + ev.events = EPOLLIN | EPOLLHUP; + ev.data.ptr = child; + + ret = asprintf(&child->name, "Thread-%d", child->pid); + if (ret == -1) + ksft_exit_fail_msg("asprintf() failed\n"); + + ret = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, child->stdout, &ev); + if (ret < 0) { + ksft_exit_fail_msg("%s EPOLL_CTL_ADD failed: %s (%d)\n", + child->name, strerror(errno), errno); + } + } + + ksft_print_msg("Started %s\n", child->name); + num_children++; +} + +static bool child_output_read(struct child_data *child) +{ + char read_data[1024]; + char work[1024]; + int ret, len, cur_work, cur_read; + + ret = read(child->stdout, read_data, sizeof(read_data)); + if (ret < 0) { + if (errno == EINTR) + return true; + + ksft_print_msg("%s: read() failed: %s (%d)\n", + child->name, strerror(errno), + errno); + return false; + } + len = ret; + + child->output_seen = true; + + /* Pick up any partial read */ + if (child->output) { + strncpy(work, child->output, sizeof(work) - 1); + cur_work = strnlen(work, sizeof(work)); + free(child->output); + child->output = NULL; + } else { + cur_work = 0; + } + + cur_read = 0; + while (cur_read < len) { + work[cur_work] = read_data[cur_read++]; + + if (work[cur_work] == '\n') { + work[cur_work] = '\0'; + ksft_print_msg("%s: %s\n", child->name, work); + cur_work = 0; + } else { + cur_work++; + } + } + + if (cur_work) { + work[cur_work] = '\0'; + ret = asprintf(&child->output, "%s", work); + if (ret == -1) + ksft_exit_fail_msg("Out of memory\n"); + } + + return false; +} + +static void child_output(struct child_data *child, uint32_t events, + bool flush) +{ + bool read_more; + + if (events & EPOLLIN) { + do { + read_more = child_output_read(child); + } while (read_more); + } + + if (events & EPOLLHUP) { + close(child->stdout); + child->stdout = -1; + flush = true; + } + + if (flush && child->output) { + ksft_print_msg("%s: %s\n", child->name, child->output); + free(child->output); + child->output = NULL; + } +} + +static void child_tickle(struct child_data *child) +{ + if (child->output_seen && !child->exited) + kill(child->pid, SIGUSR1); +} + +static void child_stop(struct child_data *child) +{ + if (!child->exited) + kill(child->pid, SIGTERM); +} + +static void child_cleanup(struct child_data *child) +{ + pid_t ret; + int status; + bool fail = false; + + if (!child->exited) { + do { + ret = waitpid(child->pid, &status, 0); + if (ret == -1 && errno == EINTR) + continue; + + if (ret == -1) { + ksft_print_msg("waitpid(%d) failed: %s (%d)\n", + child->pid, strerror(errno), + errno); + fail = true; + break; + } + + if (WIFEXITED(status)) { + child->exit_status = WEXITSTATUS(status); + child->exited = true; + } + + if (WIFSIGNALED(status)) { + child->exit_signal = WTERMSIG(status); + ksft_print_msg("%s: Exited due to signal %d\n", + child->name); + fail = true; + child->exited = true; + } + } while (!child->exited); + } + + if (!child->output_seen) { + ksft_print_msg("%s no output seen\n", child->name); + fail = true; + } + + if (child->exit_status != 0) { + ksft_print_msg("%s exited with error code %d\n", + child->name, child->exit_status); + fail = true; + } + + ksft_test_result(!fail, "%s\n", child->name); +} + +static void handle_child_signal(int sig, siginfo_t *info, void *context) +{ + int i; + bool found = false; + + for (i = 0; i < num_children; i++) { + if (children[i].pid == info->si_pid) { + children[i].exited = true; + children[i].exit_status = 
info->si_status; + found = true; + break; + } + } + + if (!found) + ksft_print_msg("SIGCHLD for unknown PID %d with status %d\n", + info->si_pid, info->si_status); +} + +static void handle_exit_signal(int sig, siginfo_t *info, void *context) +{ + int i; + + /* If we're already exiting then don't signal again */ + if (terminate) + return; + + ksft_print_msg("Got signal, exiting...\n"); + + terminate = true; + + /* + * This should be redundant, the main loop should clean up + * after us, but for safety stop everything we can here. + */ + for (i = 0; i < num_children; i++) + child_stop(&children[i]); +} + +/* Handle any pending output without blocking */ +static void drain_output(bool flush) +{ + int ret = 1; + int i; + + while (ret > 0) { + ret = epoll_wait(epoll_fd, evs, tests, 0); + if (ret < 0) { + if (errno == EINTR) + continue; + ksft_print_msg("epoll_wait() failed: %s (%d)\n", + strerror(errno), errno); + } + + for (i = 0; i < ret; i++) + child_output(evs[i].data.ptr, evs[i].events, flush); + } +} + +static const struct option options[] = { + { "timeout", required_argument, NULL, 't' }, + { } +}; + +int main(int argc, char **argv) +{ + int seen_children; + bool all_children_started = false; + int gcs_threads; + int timeout = 10; + int ret, cpus, i, c; + struct sigaction sa; + + while ((c = getopt_long(argc, argv, "t:", options, NULL)) != -1) { + switch (c) { + case 't': + ret = sscanf(optarg, "%d", &timeout); + if (ret != 1) + ksft_exit_fail_msg("Failed to parse timeout %s\n", + optarg); + break; + default: + ksft_exit_fail_msg("Unknown argument\n"); + } + } + + cpus = num_processors(); + tests = 0; + + if (getauxval(AT_HWCAP) & HWCAP_GCS) { + /* One extra thread, trying to trigger migrations */ + gcs_threads = cpus + 1; + tests += gcs_threads; + } else { + gcs_threads = 0; + } + + ksft_print_header(); + ksft_set_plan(tests); + + ksft_print_msg("%d CPUs, %d GCS threads\n", + cpus, gcs_threads); + + if (!tests) + ksft_exit_skip("No tests scheduled\n"); + + if (timeout > 0) + ksft_print_msg("Will run for %ds\n", timeout); + else + ksft_print_msg("Will run until terminated\n"); + + children = calloc(sizeof(*children), tests); + if (!children) + ksft_exit_fail_msg("Unable to allocate child data\n"); + + ret = epoll_create1(EPOLL_CLOEXEC); + if (ret < 0) + ksft_exit_fail_msg("epoll_create1() failed: %s (%d)\n", + strerror(errno), ret); + epoll_fd = ret; + + /* Create a pipe which children will block on before execing */ + ret = pipe(startup_pipe); + if (ret != 0) + ksft_exit_fail_msg("Failed to create startup pipe: %s (%d)\n", + strerror(errno), errno); + + /* Get signal handers ready before we start any children */ + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handle_exit_signal; + sa.sa_flags = SA_RESTART | SA_SIGINFO; + sigemptyset(&sa.sa_mask); + ret = sigaction(SIGINT, &sa, NULL); + if (ret < 0) + ksft_print_msg("Failed to install SIGINT handler: %s (%d)\n", + strerror(errno), errno); + ret = sigaction(SIGTERM, &sa, NULL); + if (ret < 0) + ksft_print_msg("Failed to install SIGTERM handler: %s (%d)\n", + strerror(errno), errno); + sa.sa_sigaction = handle_child_signal; + ret = sigaction(SIGCHLD, &sa, NULL); + if (ret < 0) + ksft_print_msg("Failed to install SIGCHLD handler: %s (%d)\n", + strerror(errno), errno); + + evs = calloc(tests, sizeof(*evs)); + if (!evs) + ksft_exit_fail_msg("Failed to allocated %d epoll events\n", + tests); + + for (i = 0; i < gcs_threads; i++) + start_thread(&children[i]); + + /* + * All children started, close the startup pipe and let them + * run. 
+ */ + close(startup_pipe[0]); + close(startup_pipe[1]); + + timeout *= 10; + for (;;) { + /* Did we get a signal asking us to exit? */ + if (terminate) + break; + + /* + * Timeout is counted in 100ms with no output, the + * tests print during startup then are silent when + * running so this should ensure they all ran enough + * to install the signal handler, this is especially + * useful in emulation where we will both be slow and + * likely to have a large set of VLs. + */ + ret = epoll_wait(epoll_fd, evs, tests, 100); + if (ret < 0) { + if (errno == EINTR) + continue; + ksft_exit_fail_msg("epoll_wait() failed: %s (%d)\n", + strerror(errno), errno); + } + + /* Output? */ + if (ret > 0) { + for (i = 0; i < ret; i++) { + child_output(evs[i].data.ptr, evs[i].events, + false); + } + continue; + } + + /* Otherwise epoll_wait() timed out */ + + /* + * If the child processes have not produced output they + * aren't actually running the tests yet. + */ + if (!all_children_started) { + seen_children = 0; + + for (i = 0; i < num_children; i++) + if (children[i].output_seen || + children[i].exited) + seen_children++; + + if (seen_children != num_children) { + ksft_print_msg("Waiting for %d children\n", + num_children - seen_children); + continue; + } + + all_children_started = true; + } + + ksft_print_msg("Sending signals, timeout remaining: %d00ms\n", + timeout); + + for (i = 0; i < num_children; i++) + child_tickle(&children[i]); + + /* Negative timeout means run indefinitely */ + if (timeout < 0) + continue; + if (--timeout == 0) + break; + } + + ksft_print_msg("Finishing up...\n"); + terminate = true; + + for (i = 0; i < tests; i++) + child_stop(&children[i]); + + drain_output(false); + + for (i = 0; i < tests; i++) + child_cleanup(&children[i]); + + drain_output(true); + + ksft_finished(); +} From bb9ae1a66c85eeb626864efd812c62026e126ec0 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 1 Oct 2024 23:59:18 +0100 Subject: [PATCH 038/168] kselftest/arm64: Enable GCS for the FP stress tests While it's a bit off topic for them the floating point stress tests do give us some coverage of context thrashing cases, and also of active signal delivery separate to the relatively complicated framework in the actual signals tests. Have the tests enable GCS on startup, ignoring failures so they continue to work as before on systems without GCS. Reviewed-by: Thiago Jung Bauermann Tested-by: Thiago Jung Bauermann Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241001-arm64-gcs-v13-39-222b78d87eee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/assembler.h | 15 +++++++++++++++ tools/testing/selftests/arm64/fp/fpsimd-test.S | 2 ++ tools/testing/selftests/arm64/fp/sve-test.S | 2 ++ tools/testing/selftests/arm64/fp/za-test.S | 2 ++ tools/testing/selftests/arm64/fp/zt-test.S | 2 ++ 5 files changed, 23 insertions(+) diff --git a/tools/testing/selftests/arm64/fp/assembler.h b/tools/testing/selftests/arm64/fp/assembler.h index 9b38a0da407d..1fc46a5642c2 100644 --- a/tools/testing/selftests/arm64/fp/assembler.h +++ b/tools/testing/selftests/arm64/fp/assembler.h @@ -65,4 +65,19 @@ endfunction bl puts .endm +#define PR_SET_SHADOW_STACK_STATUS 75 +# define PR_SHADOW_STACK_ENABLE (1UL << 0) + +.macro enable_gcs + // Run with GCS + mov x0, PR_SET_SHADOW_STACK_STATUS + mov x1, PR_SHADOW_STACK_ENABLE + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x8, #__NR_prctl + svc #0 +.endm + #endif /* ! 
ASSEMBLER_H */
diff --git a/tools/testing/selftests/arm64/fp/fpsimd-test.S b/tools/testing/selftests/arm64/fp/fpsimd-test.S index 8b960d01ed2e..b16fb7f42e3e 100644 --- a/tools/testing/selftests/arm64/fp/fpsimd-test.S +++ b/tools/testing/selftests/arm64/fp/fpsimd-test.S @@ -215,6 +215,8 @@ endfunction // Main program entry point .globl _start function _start + enable_gcs + mov x23, #0 // signal count mov w0, #SIGINT
diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S index fff60e2a25ad..2fb4f0b84476 100644 --- a/tools/testing/selftests/arm64/fp/sve-test.S +++ b/tools/testing/selftests/arm64/fp/sve-test.S @@ -378,6 +378,8 @@ endfunction // Main program entry point .globl _start function _start + enable_gcs + mov x23, #0 // Irritation signal count mov w0, #SIGINT
diff --git a/tools/testing/selftests/arm64/fp/za-test.S b/tools/testing/selftests/arm64/fp/za-test.S index 095b45531640..b2603aba99de 100644 --- a/tools/testing/selftests/arm64/fp/za-test.S +++ b/tools/testing/selftests/arm64/fp/za-test.S @@ -231,6 +231,8 @@ endfunction // Main program entry point .globl _start function _start + enable_gcs + mov x23, #0 // signal count mov w0, #SIGINT
diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S index b5c81e81a379..8d9609a49008 100644 --- a/tools/testing/selftests/arm64/fp/zt-test.S +++ b/tools/testing/selftests/arm64/fp/zt-test.S @@ -200,6 +200,8 @@ endfunction // Main program entry point .globl _start function _start + enable_gcs + mov x23, #0 // signal count mov w0, #SIGINT
From cc847678998305c72c9850efa3fd040b274a8180 Mon Sep 17 00:00:00 2001 From: Yunhui Cui Date: Thu, 19 Sep 2024 11:46:01 +0800 Subject: [PATCH 039/168] drivers perf: remove unused field pmu_node The driver does not use the pmu_node field, so remove it. Signed-off-by: Yunhui Cui Reviewed-by: Shuai Xue Link: https://lore.kernel.org/r/20240919034601.2453-1-cuiyunhui@bytedance.com Signed-off-by: Will Deacon --- drivers/perf/dwc_pcie_pmu.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c index 4ca50f9b6dfe..59526a48499f 100644 --- a/drivers/perf/dwc_pcie_pmu.c +++ b/drivers/perf/dwc_pcie_pmu.c @@ -82,7 +82,6 @@ struct dwc_pcie_pmu { u16 ras_des_offset; u32 nr_lanes; - struct list_head pmu_node; struct hlist_node cpuhp_node; struct perf_event *event[DWC_PCIE_EVENT_TYPE_MAX]; int on_cpu;
From 6105c5d46d0b39ccf01e329fc4449ba840c2937d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 8 Oct 2024 16:58:49 +0100 Subject: [PATCH 040/168] arm64: probes: Move kprobes-specific fields
We share struct arch_probe_insn between kprobes and uprobes, but most of its fields aren't necessary for uprobes: * The 'insn' field is only used by kprobes as a pointer to the XOL slot. * The 'restore' field is only used by kprobes as the PC to restore after stepping an instruction in the XOL slot. * The 'pstate_cc' field isn't used by kprobes or uprobes, and seems to only exist as a result of copy-pasting the 32-bit arm implementation of kprobes. As these fields live in struct arch_probe_insn they cannot use definitions that only exist when CONFIG_KPROBES=y, such as the kprobe_opcode_t typedef, which we'd like to use in subsequent patches. Clean this up by removing the 'pstate_cc' field, and moving the kprobes-specific fields into the kprobes-specific struct arch_specific_insn.
To make it clear that the fields are related to stepping instructions in the XOL slot, 'insn' is renamed to 'xol_insn' and 'restore' is renamed to 'xol_restore'. At the same time, remove the misleading and useless comment above struct arch_probe_insn. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241008155851.801546-5-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/probes.h | 8 +++----- arch/arm64/kernel/probes/kprobes.c | 30 +++++++++++++++--------------- 2 files changed, 18 insertions(+), 20 deletions(-)
diff --git a/arch/arm64/include/asm/probes.h b/arch/arm64/include/asm/probes.h index 006946745352..4aa54322794d 100644 --- a/arch/arm64/include/asm/probes.h +++ b/arch/arm64/include/asm/probes.h @@ -12,18 +12,16 @@ typedef u32 probe_opcode_t; typedef void (probes_handler_t) (u32 opcode, long addr, struct pt_regs *); -/* architecture specific copy of original instruction */ struct arch_probe_insn { - probe_opcode_t *insn; - pstate_check_t *pstate_cc; probes_handler_t *handler; - /* restore address after step xol */ - unsigned long restore; }; #ifdef CONFIG_KPROBES typedef u32 kprobe_opcode_t; struct arch_specific_insn { struct arch_probe_insn api; + probe_opcode_t *xol_insn; + /* restore address after step xol */ + unsigned long xol_restore; }; #endif
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 4268678d0e86..222419a41a40 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -43,7 +43,7 @@ post_kprobe_handler(struct kprobe *, struct kprobe_ctlblk *, struct pt_regs *); static void __kprobes arch_prepare_ss_slot(struct kprobe *p) { - kprobe_opcode_t *addr = p->ainsn.api.insn; + kprobe_opcode_t *addr = p->ainsn.xol_insn; /* * Prepare insn slot, Mark Rutland points out it depends on a coupe of @@ -70,14 +70,14 @@ static void __kprobes arch_prepare_ss_slot(struct kprobe *p) /* * Needs restoring of return address after stepping xol. */ - p->ainsn.api.restore = (unsigned long) p->addr + + p->ainsn.xol_restore = (unsigned long) p->addr + sizeof(kprobe_opcode_t); } static void __kprobes arch_prepare_simulate(struct kprobe *p) { /* This instructions is not executed xol.
No need to adjust the PC */ - p->ainsn.api.restore = 0; + p->ainsn.xol_restore = 0; } static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) @@ -110,18 +110,18 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return -EINVAL; case INSN_GOOD_NO_SLOT: /* insn need simulation */ - p->ainsn.api.insn = NULL; + p->ainsn.xol_insn = NULL; break; case INSN_GOOD: /* instruction uses slot */ - p->ainsn.api.insn = get_insn_slot(); - if (!p->ainsn.api.insn) + p->ainsn.xol_insn = get_insn_slot(); + if (!p->ainsn.xol_insn) return -ENOMEM; break; } /* prepare the instruction */ - if (p->ainsn.api.insn) + if (p->ainsn.xol_insn) arch_prepare_ss_slot(p); else arch_prepare_simulate(p); @@ -148,9 +148,9 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { - if (p->ainsn.api.insn) { - free_insn_slot(p->ainsn.api.insn, 0); - p->ainsn.api.insn = NULL; + if (p->ainsn.xol_insn) { + free_insn_slot(p->ainsn.xol_insn, 0); + p->ainsn.xol_insn = NULL; } } @@ -205,9 +205,9 @@ static void __kprobes setup_singlestep(struct kprobe *p, } - if (p->ainsn.api.insn) { + if (p->ainsn.xol_insn) { /* prepare for single stepping */ - slot = (unsigned long)p->ainsn.api.insn; + slot = (unsigned long)p->ainsn.xol_insn; kprobes_save_local_irqflag(kcb, regs); instruction_pointer_set(regs, slot); @@ -245,8 +245,8 @@ static void __kprobes post_kprobe_handler(struct kprobe *cur, struct kprobe_ctlblk *kcb, struct pt_regs *regs) { /* return addr restore if non-branching insn */ - if (cur->ainsn.api.restore != 0) - instruction_pointer_set(regs, cur->ainsn.api.restore); + if (cur->ainsn.xol_restore != 0) + instruction_pointer_set(regs, cur->ainsn.xol_restore); /* restore back original saved kprobe variables and continue */ if (kcb->kprobe_status == KPROBE_REENTER) { @@ -348,7 +348,7 @@ kprobe_breakpoint_ss_handler(struct pt_regs *regs, unsigned long esr) struct kprobe *cur = kprobe_running(); if (cur && (kcb->kprobe_status & (KPROBE_HIT_SS | KPROBE_REENTER)) && - ((unsigned long)&cur->ainsn.api.insn[1] == addr)) { + ((unsigned long)&cur->ainsn.xol_insn[1] == addr)) { kprobes_restore_local_irqflag(kcb, regs); post_kprobe_handler(cur, kcb, regs); From dd0eb50e7c71fdd74f2ab0ef4b60bd6b27bbc670 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 8 Oct 2024 16:58:50 +0100 Subject: [PATCH 041/168] arm64: probes: Cleanup kprobes endianness conversions The core kprobes code uses kprobe_opcode_t for the in-memory representation of an instruction, using 'kprobe_opcode_t *' for XOL slots. As arm64 instructions are always little-endian 32-bit values, kprobes_opcode_t should be __le32, but at the moment kprobe_opcode_t is typedef'd to u32. Today there is no functional issue as we convert values via cpu_to_le32() and le32_to_cpu() where necessary, but these conversions are inconsistent with the types used, causing sparse warnings: | CHECK arch/arm64/kernel/probes/kprobes.c | arch/arm64/kernel/probes/kprobes.c:102:21: warning: cast to restricted __le32 | CHECK arch/arm64/kernel/probes/decode-insn.c | arch/arm64/kernel/probes/decode-insn.c:122:46: warning: cast to restricted __le32 | arch/arm64/kernel/probes/decode-insn.c:124:50: warning: cast to restricted __le32 | arch/arm64/kernel/probes/decode-insn.c:136:31: warning: cast to restricted __le32 Improve this by making kprobes_opcode_t a typedef for __le32 and consistently using this for pointers to executable instructions. With this change we can rely on the type system to tell us where conversions are necessary. 
Since kprobe::opcode is changed from u32 to __le32, the existing le32_to_cpu() converion moves from the point this is initialized (in arch_prepare_kprobe()) to the points this is consumed when passed to a handler or text patching function. As kprobe::opcode isn't altered or consumed elsewhere, this shouldn't result in a functional change. Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241008155851.801546-6-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/probes.h | 4 ++-- arch/arm64/kernel/probes/decode-insn.c | 2 +- arch/arm64/kernel/probes/kprobes.c | 9 +++++---- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/probes.h b/arch/arm64/include/asm/probes.h index 4aa54322794d..11e809733b7d 100644 --- a/arch/arm64/include/asm/probes.h +++ b/arch/arm64/include/asm/probes.h @@ -16,10 +16,10 @@ struct arch_probe_insn { probes_handler_t *handler; }; #ifdef CONFIG_KPROBES -typedef u32 kprobe_opcode_t; +typedef __le32 kprobe_opcode_t; struct arch_specific_insn { struct arch_probe_insn api; - probe_opcode_t *xol_insn; + kprobe_opcode_t *xol_insn; /* restore address after step xol */ unsigned long xol_restore; }; diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 3496d6169e59..147d6ddf3a4c 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -134,7 +134,7 @@ arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) { enum probe_insn decoded; probe_opcode_t insn = le32_to_cpu(*addr); - probe_opcode_t *scan_end = NULL; + kprobe_opcode_t *scan_end = NULL; unsigned long size = 0, offset = 0; struct arch_probe_insn *api = &asi->api; diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 222419a41a40..48d88e07611d 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -64,7 +64,7 @@ static void __kprobes arch_prepare_ss_slot(struct kprobe *p) * the BRK exception handler, so it is unnecessary to generate * Contex-Synchronization-Event via ISB again. 
*/ - aarch64_insn_patch_text_nosync(addr, p->opcode); + aarch64_insn_patch_text_nosync(addr, le32_to_cpu(p->opcode)); aarch64_insn_patch_text_nosync(addr + 1, BRK64_OPCODE_KPROBES_SS); /* @@ -85,7 +85,7 @@ static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); if (p->ainsn.api.handler) - p->ainsn.api.handler((u32)p->opcode, (long)p->addr, regs); + p->ainsn.api.handler(le32_to_cpu(p->opcode), (long)p->addr, regs); /* single step simulated, now go for post processing */ post_kprobe_handler(p, kcb, regs); @@ -99,7 +99,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return -EINVAL; /* copy instruction */ - p->opcode = le32_to_cpu(*p->addr); + p->opcode = *p->addr; if (search_exception_tables(probe_addr)) return -EINVAL; @@ -142,8 +142,9 @@ void __kprobes arch_arm_kprobe(struct kprobe *p) void __kprobes arch_disarm_kprobe(struct kprobe *p) { void *addr = p->addr; + u32 insn = le32_to_cpu(p->opcode); - aarch64_insn_patch_text(&addr, &p->opcode, 1); + aarch64_insn_patch_text(&addr, &insn, 1); } void __kprobes arch_remove_kprobe(struct kprobe *p)
From 14762109de024837dafcb893d45ccaf6a08ac990 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 8 Oct 2024 16:58:51 +0100 Subject: [PATCH 042/168] arm64: probes: Remove probe_opcode_t
The probe_opcode_t typedef for u32 isn't necessary, and is a source of confusion as it is easily confused with kprobe_opcode_t, which is a typedef for __le32. The typedef is only used within arch/arm64, and all of arm64's common insn code uses u32 for the endian-agnostic value of an instruction, so it'd be clearer to use u32 consistently. Remove probe_opcode_t and use u32 directly. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20241008155851.801546-7-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/probes.h | 1 - arch/arm64/kernel/probes/decode-insn.c | 4 ++-- arch/arm64/kernel/probes/decode-insn.h | 2 +- arch/arm64/kernel/probes/uprobes.c | 4 ++-- 4 files changed, 5 insertions(+), 6 deletions(-)
diff --git a/arch/arm64/include/asm/probes.h b/arch/arm64/include/asm/probes.h index 11e809733b7d..d49368886309 100644 --- a/arch/arm64/include/asm/probes.h +++ b/arch/arm64/include/asm/probes.h @@ -9,7 +9,6 @@ #include -typedef u32 probe_opcode_t; typedef void (probes_handler_t) (u32 opcode, long addr, struct pt_regs *); struct arch_probe_insn {
diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 147d6ddf3a4c..41b100bcb041 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -73,7 +73,7 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) * INSN_GOOD_NO_SLOT If instruction is supported but doesn't use its slot.
*/ enum probe_insn __kprobes -arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *api) +arm_probe_decode_insn(u32 insn, struct arch_probe_insn *api) { /* * Instructions reading or modifying the PC won't work from the XOL @@ -133,7 +133,7 @@ enum probe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) { enum probe_insn decoded; - probe_opcode_t insn = le32_to_cpu(*addr); + u32 insn = le32_to_cpu(*addr); kprobe_opcode_t *scan_end = NULL; unsigned long size = 0, offset = 0; struct arch_probe_insn *api = &asi->api; diff --git a/arch/arm64/kernel/probes/decode-insn.h b/arch/arm64/kernel/probes/decode-insn.h index 8b758c5a2062..0e4195de8206 100644 --- a/arch/arm64/kernel/probes/decode-insn.h +++ b/arch/arm64/kernel/probes/decode-insn.h @@ -28,6 +28,6 @@ enum probe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi); #endif enum probe_insn __kprobes -arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *asi); +arm_probe_decode_insn(u32 insn, struct arch_probe_insn *asi); #endif /* _ARM_KERNEL_KPROBES_ARM64_H */ diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index a2f137a595fc..fa0b7941d204 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -34,7 +34,7 @@ unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { - probe_opcode_t insn; + u32 insn; /* TODO: Currently we do not support AARCH32 instruction probing */ if (mm->context.flags & MMCF_AARCH32) @@ -102,7 +102,7 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t) bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { - probe_opcode_t insn; + u32 insn; unsigned long addr; if (!auprobe->simulate) From ab23df141f5318cf661504fb446159ce2dbd1ec2 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2024 13:39:15 +0100 Subject: [PATCH 043/168] arm64: asm-offsets: remove TSK_ACTIVE_MM The TSK_ACTIVE_MM definition isn't used anywhere. Remove it. Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241007123921.549340-2-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/asm-offsets.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 27de1dddb0ab..42fd36027545 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -28,8 +28,6 @@ int main(void) { - DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm)); - BLANK(); DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu)); DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); From 7bd8870af8dde527e0e8e83838de8966418c58f3 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2024 13:39:16 +0100 Subject: [PATCH 044/168] arm64: asm-offsets: remove VMA_VM_* The VMA_VM_MM definition is only used by the vma_vm_mm macro, which itself is unused. The VMA_VM_FLAGS definition isn't used anywhere. Remove them all. 
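For orientation, the entries being removed in this group of asm-offsets patches all follow the same pattern: each DEFINE() emits an assembler-visible constant derived from a C struct layout, so entries with no remaining assembly users can simply be deleted. A minimal illustrative sketch of that pattern, using an entry that does survive the series and assuming the usual <linux/kbuild.h> helpers (this is not a complete asm-offsets.c):

    #include <linux/kbuild.h>
    #include <linux/sched.h>
    #include <linux/stddef.h>

    int main(void)
    {
            /* ends up as "#define TSK_TI_FLAGS <offset>" in include/generated/asm-offsets.h */
            DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags));
            BLANK();
            return 0;
    }
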
Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241007123921.549340-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/assembler.h | 7 ------- arch/arm64/kernel/asm-offsets.c | 3 --- 2 files changed, 10 deletions(-)
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index bc0b0d75acef..3d8d534a7a77 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -248,13 +248,6 @@ alternative_endif ldr \dst, [\dst, \tmp] .endm -/* - * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm) - */ - .macro vma_vm_mm, rd, rn - ldr \rd, [\rn, #VMA_VM_MM] - .endm - /* * read_ctr - read CTR_EL0. If the system has mismatched register fields, * provide the system wide safe value from arm64_ftr_reg_ctrel0.sys_val
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 42fd36027545..5f8b83915fac 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -104,9 +104,6 @@ int main(void) #endif DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); BLANK(); - DEFINE(VMA_VM_MM, offsetof(struct vm_area_struct, vm_mm)); - DEFINE(VMA_VM_FLAGS, offsetof(struct vm_area_struct, vm_flags)); - BLANK(); DEFINE(VM_EXEC, VM_EXEC); BLANK(); DEFINE(PAGE_SZ, PAGE_SIZE);
From 4c92c121c402cf3a9a60f58fbd2650577c68f1ac Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2024 13:39:17 +0100 Subject: [PATCH 045/168] arm64: asm-offsets: remove COMPAT_{RT_,}SIGFRAME_REGS_OFFSET
The COMPAT_SIGFRAME_REGS_OFFSET and COMPAT_RT_SIGFRAME_REGS_OFFSET definitions aren't used anywhere. They were added in commit: f14d8025d263f3c8 ("arm64: compat: Generate asm offsets for signals") ... and subsequently their only user was removed in commit: 2d071968a4052e58 ("arm64: compat: Remove 32-bit sigreturn code from the vDSO") ... leaving them unused. Remove them. Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241007123921.549340-4-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/asm-offsets.c | 6 ------ 1 file changed, 6 deletions(-)
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 5f8b83915fac..520914b901a2 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -96,11 +95,6 @@ int main(void) #endif DEFINE(FREGS_SIZE, sizeof(struct ftrace_regs)); BLANK(); -#endif -#ifdef CONFIG_COMPAT - DEFINE(COMPAT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_sigframe, uc.uc_mcontext.arm_r0)); - DEFINE(COMPAT_RT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_rt_sigframe, sig.uc.uc_mcontext.arm_r0)); - BLANK(); #endif DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); BLANK();
From 1abc7c1e593337491d27b84a2a0b79c6d8164811 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2024 13:39:18 +0100 Subject: [PATCH 046/168] arm64: asm-offsets: remove MM_CONTEXT_ID
The only user of the MM_CONTEXT_ID definition was removed in commit: 25b92693a1b67a47 ("arm64: mm: convert cpu_do_switch_mm() to C") Remove MM_CONTEXT_ID.
Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241007123921.549340-5-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/asm-offsets.c | 2 -- 1 file changed, 2 deletions(-)
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 520914b901a2..e6a6b5160f75 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -96,8 +96,6 @@ int main(void) DEFINE(FREGS_SIZE, sizeof(struct ftrace_regs)); BLANK(); #endif - DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); - BLANK(); DEFINE(VM_EXEC, VM_EXEC); BLANK(); DEFINE(PAGE_SZ, PAGE_SIZE);
From 4ce689b4480a528f8d088c71d26744944e360f76 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2024 13:39:19 +0100 Subject: [PATCH 047/168] arm64: asm-offsets: remove VM_EXEC and PAGE_SZ
The VM_EXEC definition duplicates the common VM_EXEC definition from . The common definition cannot safely be included by assembly code but currently we don't need to use VM_EXEC in assembly. The PAGE_SZ definition duplicates arm64's definition of PAGE_SIZE from which can safely be included from assembly code and should be used directly. Remove the duplicate definitions. Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241007123921.549340-6-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/asm-offsets.c | 4 ---- 1 file changed, 4 deletions(-)
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index e6a6b5160f75..7c7c8d98256f 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -96,10 +96,6 @@ int main(void) DEFINE(FREGS_SIZE, sizeof(struct ftrace_regs)); BLANK(); #endif - DEFINE(VM_EXEC, VM_EXEC); - BLANK(); - DEFINE(PAGE_SZ, PAGE_SIZE); - BLANK(); DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE); DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE); BLANK();
From b129125e1f963500fd1cbd19e4155a65ce922944 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2024 13:39:20 +0100 Subject: [PATCH 048/168] arm64: asm-offsets: remove DMA_{TO,FROM}_DEVICE
The DMA_TO_DEVICE and DMA_FROM_DEVICE definitions in asm-offsets duplicate the common definitions from (which used to live in ), and haven't been used from assembly code since commit: 7eacf1858bc86fe9 ("arm64: mm: Remove assembly DMA cache maintenance wrappers") Remove them both.
Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241007123921.549340-7-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/asm-offsets.c | 4 ---- 1 file changed, 4 deletions(-)
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 7c7c8d98256f..e5ffdc0163aa 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -96,9 +95,6 @@ int main(void) DEFINE(FREGS_SIZE, sizeof(struct ftrace_regs)); BLANK(); #endif - DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE); - DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE); - BLANK(); DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET); BLANK(); DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task));
From 7bb32dc788ddd0adae1273e799b920a790610fac Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Oct 2024 13:39:21 +0100 Subject: [PATCH 049/168] arm64: asm-offsets: remove PREEMPT_DISABLE_OFFSET
The PREEMPT_DISABLE_OFFSET definition was added in commit: 24534b3511828c66 ("arm64: assembler: add macros to conditionally yield the NEON under PREEMPT") ... but hasn't been used since commit: 3931261ecf46151a ("arm64: fpsimd: Bring cond_yield asm macro in line with new rules") Remove PREEMPT_DISABLE_OFFSET. Signed-off-by: Mark Rutland Cc: Will Deacon Link: https://lore.kernel.org/r/20241007123921.549340-8-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/asm-offsets.c | 3 --- 1 file changed, 3 deletions(-)
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index e5ffdc0163aa..cb2e9567dca9 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -95,8 +94,6 @@ int main(void) DEFINE(FREGS_SIZE, sizeof(struct ftrace_regs)); BLANK(); #endif - DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET); - BLANK(); DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); BLANK(); DEFINE(FTR_OVR_VAL_OFFSET, offsetof(struct arm64_ftr_override, val));
From ac4ad5c09b34adb9c2937af874b63762fe66f725 Mon Sep 17 00:00:00 2001 From: Liao Chang Date: Mon, 9 Sep 2024 07:11:14 +0000 Subject: [PATCH 050/168] arm64: insn: Simulate nop instruction for better uprobe performance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
v2->v1: 1. Remove the simulation of STP and the related bits. 2. Use arm64_skip_faulting_instruction for single-stepping or FEAT_BTI scenario.
As Andrii pointed out, the uprobe/uretprobe selftest bench runs into a counterintuitive result that the nop and push variants are much slower than the ret variant [0]. The root cause lies in arch_probe_analyse_insn(), which excludes 'nop' and 'stp' from the emulatable instructions list. This forces the kernel to return to userspace and execute them out-of-line, then trap back into the kernel to run the uprobe callback functions. This leads to a significant performance overhead compared to the 'ret' variant, which is already emulated. Typically, a uprobe is installed on 'nop' for USDT and on function entry, which starts with the instruction 'stp x29, x30, [sp, #imm]!' to push lr and fp onto the stack, regardless of whether it is a kernel or userspace binary. In order to improve the performance of handling uprobes for common use cases, this patch supports the emulation of the arm64 equivalents of the 'nop' and 'push' instructions.
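As an illustrative aside (not part of the patch), the two probe sites referred to above look roughly like this in a user binary: a USDT probe point compiles down to a single nop, and an arm64 function entry normally begins by pushing the frame record:

    /* hypothetical probed function, for illustration only */
    void probed_function(void)
    {
            /*
             * The compiler-generated prologue at the entry point is where a
             * function-entry uprobe lands, typically:
             *   stp x29, x30, [sp, #-16]!   // push fp and lr
             *   mov x29, sp
             */

            /* a USDT-style probe site: a lone nop for a uprobe to land on */
            asm volatile("nop");
    }
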
The benchmark results below indicate that the performance gain from emulation is obvious. On Kunpeng916 (Hi1616), 4 NUMA nodes, 64 Arm64 cores@2.4GHz.
xol (1 cpus)
------------
uprobe-nop: 0.916 ± 0.001M/s (0.916M/prod)
uprobe-push: 0.908 ± 0.001M/s (0.908M/prod)
uprobe-ret: 1.855 ± 0.000M/s (1.855M/prod)
uretprobe-nop: 0.640 ± 0.000M/s (0.640M/prod)
uretprobe-push: 0.633 ± 0.001M/s (0.633M/prod)
uretprobe-ret: 0.978 ± 0.003M/s (0.978M/prod)
emulation (1 cpus)
-------------------
uprobe-nop: 1.862 ± 0.002M/s (1.862M/prod)
uprobe-push: 1.743 ± 0.006M/s (1.743M/prod)
uprobe-ret: 1.840 ± 0.001M/s (1.840M/prod)
uretprobe-nop: 0.964 ± 0.004M/s (0.964M/prod)
uretprobe-push: 0.936 ± 0.004M/s (0.936M/prod)
uretprobe-ret: 0.940 ± 0.001M/s (0.940M/prod)
As shown above, the performance gap between the 'nop/push' and 'ret' variants has been significantly reduced. Because the emulation of the 'push' instruction needs to access userspace memory, it spends more cycles than the others. As Mark suggested [1], it is painful to emulate the correct atomicity and ordering properties of STP, especially when it interacts with MTE, POE, etc. So this patch just focuses on the simulation of 'nop'. The simulation of STP and related changes will be addressed in a separate patch. [0] https://lore.kernel.org/all/CAEf4BzaO4eG6hr2hzXYpn+7Uer4chS0R99zLn02ezZ5YruVuQw@mail.gmail.com/ [1] https://lore.kernel.org/all/Zr3RN4zxF5XPgjEB@J2N7QTR9R3/ CC: Andrii Nakryiko CC: Mark Rutland Signed-off-by: Liao Chang Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20240909071114.1150053-1-liaochang1@huawei.com [catalin.marinas@arm.com: small tweaks following MarkR's comments] Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/insn.h | 5 +++++ arch/arm64/kernel/probes/decode-insn.c | 9 +++++++++ arch/arm64/kernel/probes/simulate-insn.c | 6 ++++++ arch/arm64/kernel/probes/simulate-insn.h | 1 + 4 files changed, 21 insertions(+)
diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 8c0a36f72d6f..89bc18989b90 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -575,6 +575,11 @@ static __always_inline u32 aarch64_insn_gen_nop(void) return aarch64_insn_gen_hint(AARCH64_INSN_HINT_NOP); } +static __always_inline bool aarch64_insn_is_nop(u32 insn) +{ + return insn == aarch64_insn_gen_nop(); +} + u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg, enum aarch64_insn_branch_type type); u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg,
diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 41b100bcb041..e05249f57075 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -75,6 +75,15 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) enum probe_insn __kprobes arm_probe_decode_insn(u32 insn, struct arch_probe_insn *api) { + /* + * While 'nop' instruction can execute in the out-of-line slot, + * simulating them in breakpoint handling offers better performance. + */ + if (aarch64_insn_is_nop(insn)) { + api->handler = simulate_nop; + return INSN_GOOD_NO_SLOT; + } + /* + * Instructions reading or modifying the PC won't work from the XOL + * slot.
diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index b65334ab79d2..4c6d2d712fbd 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -196,3 +196,9 @@ simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs) instruction_pointer_set(regs, instruction_pointer(regs) + 4); } + +void __kprobes +simulate_nop(u32 opcode, long addr, struct pt_regs *regs) +{ + arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); +} diff --git a/arch/arm64/kernel/probes/simulate-insn.h b/arch/arm64/kernel/probes/simulate-insn.h index e065dc92218e..efb2803ec943 100644 --- a/arch/arm64/kernel/probes/simulate-insn.h +++ b/arch/arm64/kernel/probes/simulate-insn.h @@ -16,5 +16,6 @@ void simulate_cbz_cbnz(u32 opcode, long addr, struct pt_regs *regs); void simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs); void simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs); void simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs); +void simulate_nop(u32 opcode, long addr, struct pt_regs *regs); #endif /* _ARM_KERNEL_KPROBES_SIMULATE_INSN_H */ From 7ffc13e233951f15728c9d09db3cc8d9f6cf81f2 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 23 Sep 2024 21:13:50 +0800 Subject: [PATCH 051/168] arm64: tlbflush: add __flush_tlb_range_limit_excess() The __flush_tlb_range_limit_excess() helper will be used when flush tlb kernel range soon. Signed-off-by: Kefeng Wang Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20240923131351.713304-2-wangkefeng.wang@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 95fbc8c05607..5f5e7d1f2e7d 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -431,6 +431,23 @@ do { \ #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \ __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled()); +static inline bool __flush_tlb_range_limit_excess(unsigned long start, + unsigned long end, unsigned long pages, unsigned long stride) +{ + /* + * When the system does not support TLB range based flush + * operation, (MAX_DVM_OPS - 1) pages can be handled. But + * with TLB range based operation, MAX_TLBI_RANGE_PAGES + * pages can be handled. + */ + if ((!system_supports_tlb_range() && + (end - start) >= (MAX_DVM_OPS * stride)) || + pages > MAX_TLBI_RANGE_PAGES) + return true; + + return false; +} + static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long stride, bool last_level, @@ -442,15 +459,7 @@ static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma, end = round_up(end, stride); pages = (end - start) >> PAGE_SHIFT; - /* - * When not uses TLB range ops, we can handle up to - * (MAX_DVM_OPS - 1) pages; - * When uses TLB range ops, we can handle up to - * MAX_TLBI_RANGE_PAGES pages. 
- */ - if ((!system_supports_tlb_range() && - (end - start) >= (MAX_DVM_OPS * stride)) || - pages > MAX_TLBI_RANGE_PAGES) { + if (__flush_tlb_range_limit_excess(start, end, pages, stride)) { flush_tlb_mm(vma->vm_mm); return; } From a923705c69f7f4ebe6a5488c1f556bed12d28031 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 23 Sep 2024 21:13:51 +0800 Subject: [PATCH 052/168] arm64: optimize flush tlb kernel range Currently the kernel TLBs is flushed page by page if the target VA range is less than MAX_DVM_OPS * PAGE_SIZE, otherwise we'll brutally issue a TLBI ALL. But we could optimize it when CPU supports TLB range operations, convert to use __flush_tlb_range_op() like other tlb range flush to improve performance. Co-developed-by: Yicong Yang Signed-off-by: Yicong Yang Signed-off-by: Kefeng Wang Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20240923131351.713304-3-wangkefeng.wang@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 5f5e7d1f2e7d..bc94e036a26b 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -501,19 +501,21 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) { - unsigned long addr; + const unsigned long stride = PAGE_SIZE; + unsigned long pages; - if ((end - start) > (MAX_DVM_OPS * PAGE_SIZE)) { + start = round_down(start, stride); + end = round_up(end, stride); + pages = (end - start) >> PAGE_SHIFT; + + if (__flush_tlb_range_limit_excess(start, end, pages, stride)) { flush_tlb_all(); return; } - start = __TLBI_VADDR(start, 0); - end = __TLBI_VADDR(end, 0); - dsb(ishst); - for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12)) - __tlbi(vaale1is, addr); + __flush_tlb_range_op(vaale1is, start, pages, stride, 0, + TLBI_TTL_UNKNOWN, false, lpa2_is_enabled()); dsb(ish); isb(); } From 8ef41786d88fd429829bd143323168a9468e3be0 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 1 Oct 2024 10:28:04 +0530 Subject: [PATCH 053/168] arm64/mm: Change pgattr_change_is_safe() arguments as pteval_t pgattr_change_is_safe() processes two distinct page table entries that just happen to be 64 bits for all levels. This changes both arguments to reflect the actual data type being processed in the function. This change is important when moving to FEAT_D128 based 128 bit page tables because it makes it simple to change the entry size in one place. 
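To make that point concrete, an illustrative sketch only (the config symbol below is hypothetical and not part of this series): once this function and its callers are written against pteval_t, moving to 128-bit descriptors becomes a typedef change rather than a tree-wide edit:

    #include <linux/types.h>

    #ifdef CONFIG_ARM64_128BIT_DESCRIPTORS          /* hypothetical option */
    typedef unsigned __int128 pteval_t;
    #else
    typedef u64 pteval_t;
    #endif

    bool pgattr_change_is_safe(pteval_t old, pteval_t new);
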
Cc: Will Deacon Cc: Mark Rutland Cc: Ard Biesheuvel Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Ryan Roberts Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20241001045804.1119881-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 2 +- arch/arm64/mm/mmu.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index c329ea061dc9..d56dbe5742d6 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -338,7 +338,7 @@ static inline pte_t __ptep_get(pte_t *ptep) } extern void __sync_icache_dcache(pte_t pteval); -bool pgattr_change_is_safe(u64 old, u64 new); +bool pgattr_change_is_safe(pteval_t old, pteval_t new); /* * PTE bits configuration in the presence of hardware Dirty Bit Management diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index e55b02fbddc8..c1b2d0dc3078 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -119,7 +119,7 @@ static phys_addr_t __init early_pgtable_alloc(int shift) return phys; } -bool pgattr_change_is_safe(u64 old, u64 new) +bool pgattr_change_is_safe(pteval_t old, pteval_t new) { /* * The following mapping attributes may be updated in live From 25c17c4b55def92a01e3eecc9c775a6ee25ca20f Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 1 Oct 2024 15:52:19 -0700 Subject: [PATCH 054/168] hugetlb: arm64: add mte support Enable MTE support for hugetlb. The MTE page flags will be set on the folio only. When copying hugetlb folio (for example, CoW), the tags for all subpages will be copied when copying the first subpage. When freeing hugetlb folio, the MTE flags will be cleared. Reviewed-by: Catalin Marinas Reviewed-by: David Hildenbrand Signed-off-by: Yang Shi Link: https://lore.kernel.org/r/20241001225220.271178-1-yang@os.amperecomputing.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/hugetlb.h | 8 ++++ arch/arm64/include/asm/mman.h | 3 +- arch/arm64/include/asm/mte.h | 67 ++++++++++++++++++++++++++++++++ arch/arm64/kernel/hibernate.c | 6 +++ arch/arm64/kernel/mte.c | 27 ++++++++++++- arch/arm64/kvm/guest.c | 16 ++++++-- arch/arm64/kvm/mmu.c | 11 ++++++ arch/arm64/mm/copypage.c | 27 ++++++++++++- fs/hugetlbfs/inode.c | 2 +- 9 files changed, 159 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 293f880865e8..c6dff3e69539 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -11,6 +11,7 @@ #define __ASM_HUGETLB_H #include +#include #include #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION @@ -21,6 +22,13 @@ extern bool arch_hugetlb_migration_supported(struct hstate *h); static inline void arch_clear_hugetlb_flags(struct folio *folio) { clear_bit(PG_dcache_clean, &folio->flags); + +#ifdef CONFIG_ARM64_MTE + if (system_supports_mte()) { + clear_bit(PG_mte_tagged, &folio->flags); + clear_bit(PG_mte_lock, &folio->flags); + } +#endif } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h index 9e39217b4afb..65bc2b07f666 100644 --- a/arch/arm64/include/asm/mman.h +++ b/arch/arm64/include/asm/mman.h @@ -38,7 +38,8 @@ static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags) * backed by tags-capable memory. The vm_flags may be overridden by a * filesystem supporting MTE (RAM-based). 
*/ - if (system_supports_mte() && (flags & MAP_ANONYMOUS)) + if (system_supports_mte() && + (flags & (MAP_ANONYMOUS | MAP_HUGETLB))) return VM_MTE_ALLOWED; return 0; diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index 0f84518632b4..6567df8ec8ca 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -41,6 +41,8 @@ void mte_free_tag_storage(char *storage); static inline void set_page_mte_tagged(struct page *page) { + VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); + /* * Ensure that the tags written prior to this function are visible * before the page flags update. @@ -53,6 +55,8 @@ static inline bool page_mte_tagged(struct page *page) { bool ret = test_bit(PG_mte_tagged, &page->flags); + VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); + /* * If the page is tagged, ensure ordering with a likely subsequent * read of the tags. @@ -76,6 +80,8 @@ static inline bool page_mte_tagged(struct page *page) */ static inline bool try_page_mte_tagging(struct page *page) { + VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); + if (!test_and_set_bit(PG_mte_lock, &page->flags)) return true; @@ -157,6 +163,67 @@ static inline int mte_ptrace_copy_tags(struct task_struct *child, #endif /* CONFIG_ARM64_MTE */ +#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_ARM64_MTE) +static inline void folio_set_hugetlb_mte_tagged(struct folio *folio) +{ + VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); + + /* + * Ensure that the tags written prior to this function are visible + * before the folio flags update. + */ + smp_wmb(); + set_bit(PG_mte_tagged, &folio->flags); + +} + +static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio) +{ + bool ret = test_bit(PG_mte_tagged, &folio->flags); + + VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); + + /* + * If the folio is tagged, ensure ordering with a likely subsequent + * read of the tags. + */ + if (ret) + smp_rmb(); + return ret; +} + +static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio) +{ + VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); + + if (!test_and_set_bit(PG_mte_lock, &folio->flags)) + return true; + + /* + * The tags are either being initialised or may have been initialised + * already. Check if the PG_mte_tagged flag has been set or wait + * otherwise. 
+ */ + smp_cond_load_acquire(&folio->flags, VAL & (1UL << PG_mte_tagged)); + + return false; +} +#else +static inline void folio_set_hugetlb_mte_tagged(struct folio *folio) +{ +} + +static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio) +{ + return false; +} + +static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio) +{ + return false; +} +#endif + static inline void mte_disable_tco_entry(struct task_struct *task) { if (!system_supports_mte()) diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 7b11d84f533c..18749e9a6c2d 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -266,9 +266,15 @@ static int swsusp_mte_save_tags(void) max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { struct page *page = pfn_to_online_page(pfn); + struct folio *folio; if (!page) continue; + folio = page_folio(page); + + if (folio_test_hugetlb(folio) && + !folio_test_hugetlb_mte_tagged(folio)) + continue; if (!page_mte_tagged(page)) continue; diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 6174671be7c1..2fbfd27ff5f2 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -38,7 +38,24 @@ EXPORT_SYMBOL_GPL(mte_async_or_asymm_mode); void mte_sync_tags(pte_t pte, unsigned int nr_pages) { struct page *page = pte_page(pte); - unsigned int i; + struct folio *folio = page_folio(page); + unsigned long i; + + if (folio_test_hugetlb(folio)) { + unsigned long nr = folio_nr_pages(folio); + + /* Hugetlb MTE flags are set for head page only */ + if (folio_try_hugetlb_mte_tagging(folio)) { + for (i = 0; i < nr; i++, page++) + mte_clear_page_tags(page_address(page)); + folio_set_hugetlb_mte_tagged(folio); + } + + /* ensure the tags are visible before the PTE is set */ + smp_wmb(); + + return; + } /* if PG_mte_tagged is set, tags have already been initialised */ for (i = 0; i < nr_pages; i++, page++) { @@ -410,6 +427,7 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, void *maddr; struct page *page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); + struct folio *folio; if (IS_ERR(page)) { err = PTR_ERR(page); @@ -428,7 +446,12 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, put_page(page); break; } - WARN_ON_ONCE(!page_mte_tagged(page)); + + folio = page_folio(page); + if (folio_test_hugetlb(folio)) + WARN_ON_ONCE(!folio_test_hugetlb_mte_tagged(folio)); + else + WARN_ON_ONCE(!page_mte_tagged(page)); /* limit access to the end of the page */ offset = offset_in_page(addr); diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 962f985977c2..e738a353b20e 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1055,6 +1055,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, void *maddr; unsigned long num_tags; struct page *page; + struct folio *folio; if (is_error_noslot_pfn(pfn)) { ret = -EFAULT; @@ -1068,10 +1069,13 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, ret = -EFAULT; goto out; } + folio = page_folio(page); maddr = page_address(page); if (!write) { - if (page_mte_tagged(page)) + if ((folio_test_hugetlb(folio) && + folio_test_hugetlb_mte_tagged(folio)) || + page_mte_tagged(page)) num_tags = mte_copy_tags_to_user(tags, maddr, MTE_GRANULES_PER_PAGE); else @@ -1085,14 +1089,20 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, * __set_ptes() in the VMM but still overriding the * tags, hence ignoring the return value. 
*/ - try_page_mte_tagging(page); + if (folio_test_hugetlb(folio)) + folio_try_hugetlb_mte_tagging(folio); + else + try_page_mte_tagging(page); num_tags = mte_copy_tags_from_user(maddr, tags, MTE_GRANULES_PER_PAGE); /* uaccess failed, don't leave stale tags */ if (num_tags != MTE_GRANULES_PER_PAGE) mte_clear_page_tags(maddr); - set_page_mte_tagged(page); + if (folio_test_hugetlb(folio)) + folio_set_hugetlb_mte_tagged(folio); + else + set_page_mte_tagged(page); kvm_release_pfn_dirty(pfn); } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index a509b63bd4dd..962449f9ac2f 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1401,10 +1401,21 @@ static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn, { unsigned long i, nr_pages = size >> PAGE_SHIFT; struct page *page = pfn_to_page(pfn); + struct folio *folio = page_folio(page); if (!kvm_has_mte(kvm)) return; + if (folio_test_hugetlb(folio)) { + /* Hugetlb has MTE flags set on head page only */ + if (folio_try_hugetlb_mte_tagging(folio)) { + for (i = 0; i < nr_pages; i++, page++) + mte_clear_page_tags(page_address(page)); + folio_set_hugetlb_mte_tagged(folio); + } + return; + } + for (i = 0; i < nr_pages; i++, page++) { if (try_page_mte_tagging(page)) { mte_clear_page_tags(page_address(page)); diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c index a7bb20055ce0..87b3f1a25535 100644 --- a/arch/arm64/mm/copypage.c +++ b/arch/arm64/mm/copypage.c @@ -18,15 +18,40 @@ void copy_highpage(struct page *to, struct page *from) { void *kto = page_address(to); void *kfrom = page_address(from); + struct folio *src = page_folio(from); + struct folio *dst = page_folio(to); + unsigned int i, nr_pages; copy_page(kto, kfrom); if (kasan_hw_tags_enabled()) page_kasan_tag_reset(to); - if (system_supports_mte() && page_mte_tagged(from)) { + if (!system_supports_mte()) + return; + + if (folio_test_hugetlb(src) && + folio_test_hugetlb_mte_tagged(src)) { + if (!folio_try_hugetlb_mte_tagging(dst)) + return; + + /* + * Populate tags for all subpages. + * + * Don't assume the first page is head page since + * huge page copy may start from any subpage. + */ + nr_pages = folio_nr_pages(src); + for (i = 0; i < nr_pages; i++) { + kfrom = page_address(folio_page(src, i)); + kto = page_address(folio_page(dst, i)); + mte_copy_page_tags(kto, kfrom); + } + folio_set_hugetlb_mte_tagged(dst); + } else if (page_mte_tagged(from)) { /* It's a new page, shouldn't have been tagged yet */ WARN_ON_ONCE(!try_page_mte_tagging(to)); + mte_copy_page_tags(kto, kfrom); set_page_mte_tagged(to); } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 5cf327337e22..f26b3b53d7de 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -110,7 +110,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap unwinds (may be important on powerpc * and ia64). */ - vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); + vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND | VM_MTE_ALLOWED); vma->vm_ops = &hugetlb_vm_ops; ret = seal_check_write(info->seals, vma); From 27879e8cb6b0fdb5cdcd76685f290729309711c6 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 1 Oct 2024 15:52:20 -0700 Subject: [PATCH 055/168] selftests: arm64: add hugetlb mte tests The tests cover mmap, mprotect hugetlb with MTE prot and COW. 
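For context, a minimal userspace sketch of what the hugetlb MTE support enables; this is illustrative only, assumes an MTE-capable CPU and a preallocated 2MB hugetlb page (e.g. via /proc/sys/vm/nr_hugepages), and omits error handling:

    #include <sys/mman.h>
    #include <sys/prctl.h>

    #ifndef PROT_MTE
    #define PROT_MTE 0x20                   /* arm64 value */
    #endif

    int main(void)
    {
            /* opt this task in to synchronous MTE tag checking */
            prctl(PR_SET_TAGGED_ADDR_CTRL,
                  PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_SYNC |
                  (0xfffe << PR_MTE_TAG_SHIFT),
                  0, 0, 0);

            /* a tag-checked hugetlb mapping, now permitted by this series */
            char *p = mmap(NULL, 2 * 1024 * 1024,
                           PROT_READ | PROT_WRITE | PROT_MTE,
                           MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

            p[0] = 1;                       /* tag-checked access */
            return 0;
    }
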
Signed-off-by: Yang Shi Link: https://lore.kernel.org/r/20241001225220.271178-2-yang@os.amperecomputing.com Signed-off-by: Catalin Marinas --- .../arm64/mte/check_hugetlb_options.c | 285 ++++++++++++++++++ 1 file changed, 285 insertions(+) create mode 100644 tools/testing/selftests/arm64/mte/check_hugetlb_options.c diff --git a/tools/testing/selftests/arm64/mte/check_hugetlb_options.c b/tools/testing/selftests/arm64/mte/check_hugetlb_options.c new file mode 100644 index 000000000000..303260a6dc65 --- /dev/null +++ b/tools/testing/selftests/arm64/mte/check_hugetlb_options.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2024 Ampere Computing LLC + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kselftest.h" +#include "mte_common_util.h" +#include "mte_def.h" + +#define TAG_CHECK_ON 0 +#define TAG_CHECK_OFF 1 + +static unsigned long default_huge_page_size(void) +{ + unsigned long hps = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + + if (!f) + return 0; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { + hps <<= 10; + break; + } + } + + free(line); + fclose(f); + return hps; +} + +static bool is_hugetlb_allocated(void) +{ + unsigned long hps = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + + if (!f) + return false; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "Hugetlb: %lu kB", &hps) == 1) { + hps <<= 10; + break; + } + } + + free(line); + fclose(f); + + if (hps > 0) + return true; + + return false; +} + +static void write_sysfs(char *str, unsigned long val) +{ + FILE *f; + + f = fopen(str, "w"); + if (!f) { + ksft_print_msg("ERR: missing %s\n", str); + return; + } + fprintf(f, "%lu", val); + fclose(f); +} + +static void allocate_hugetlb() +{ + write_sysfs("/proc/sys/vm/nr_hugepages", 2); +} + +static void free_hugetlb() +{ + write_sysfs("/proc/sys/vm/nr_hugepages", 0); +} + +static int check_child_tag_inheritance(char *ptr, int size, int mode) +{ + int i, parent_tag, child_tag, fault, child_status; + pid_t child; + + parent_tag = MT_FETCH_TAG((uintptr_t)ptr); + fault = 0; + + child = fork(); + if (child == -1) { + ksft_print_msg("FAIL: child process creation\n"); + return KSFT_FAIL; + } else if (child == 0) { + mte_initialize_current_context(mode, (uintptr_t)ptr, size); + /* Do copy on write */ + memset(ptr, '1', size); + mte_wait_after_trig(); + if (cur_mte_cxt.fault_valid == true) { + fault = 1; + goto check_child_tag_inheritance_err; + } + for (i = 0; i < size; i += MT_GRANULE_SIZE) { + child_tag = MT_FETCH_TAG((uintptr_t)(mte_get_tag_address(ptr + i))); + if (parent_tag != child_tag) { + ksft_print_msg("FAIL: child mte tag (%d) mismatch\n", i); + fault = 1; + goto check_child_tag_inheritance_err; + } + } +check_child_tag_inheritance_err: + _exit(fault); + } + /* Wait for child process to terminate */ + wait(&child_status); + if (WIFEXITED(child_status)) + fault = WEXITSTATUS(child_status); + else + fault = 1; + return (fault) ? 
KSFT_FAIL : KSFT_PASS; +} + +static int check_mte_memory(char *ptr, int size, int mode, int tag_check) +{ + mte_initialize_current_context(mode, (uintptr_t)ptr, size); + memset(ptr, '1', size); + mte_wait_after_trig(); + if (cur_mte_cxt.fault_valid == true) + return KSFT_FAIL; + + return KSFT_PASS; +} + +static int check_hugetlb_memory_mapping(int mem_type, int mode, int mapping, int tag_check) +{ + char *ptr, *map_ptr; + int result; + unsigned long map_size; + + map_size = default_huge_page_size(); + + mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG); + map_ptr = (char *)mte_allocate_memory(map_size, mem_type, mapping, false); + if (check_allocated_memory(map_ptr, map_size, mem_type, false) != KSFT_PASS) + return KSFT_FAIL; + + mte_initialize_current_context(mode, (uintptr_t)map_ptr, map_size); + /* Only mte enabled memory will allow tag insertion */ + ptr = mte_insert_tags((void *)map_ptr, map_size); + if (!ptr || cur_mte_cxt.fault_valid == true) { + ksft_print_msg("FAIL: Insert tags on anonymous mmap memory\n"); + munmap((void *)map_ptr, map_size); + return KSFT_FAIL; + } + result = check_mte_memory(ptr, map_size, mode, tag_check); + mte_clear_tags((void *)ptr, map_size); + mte_free_memory((void *)map_ptr, map_size, mem_type, false); + if (result == KSFT_FAIL) + return KSFT_FAIL; + + return KSFT_PASS; +} + +static int check_clear_prot_mte_flag(int mem_type, int mode, int mapping) +{ + char *map_ptr; + int prot_flag, result; + unsigned long map_size; + + prot_flag = PROT_READ | PROT_WRITE; + mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG); + map_size = default_huge_page_size(); + map_ptr = (char *)mte_allocate_memory_tag_range(map_size, mem_type, mapping, + 0, 0); + if (check_allocated_memory_range(map_ptr, map_size, mem_type, + 0, 0) != KSFT_PASS) + return KSFT_FAIL; + /* Try to clear PROT_MTE property and verify it by tag checking */ + if (mprotect(map_ptr, map_size, prot_flag)) { + mte_free_memory_tag_range((void *)map_ptr, map_size, mem_type, + 0, 0); + ksft_print_msg("FAIL: mprotect not ignoring clear PROT_MTE property\n"); + return KSFT_FAIL; + } + result = check_mte_memory(map_ptr, map_size, mode, TAG_CHECK_ON); + mte_free_memory_tag_range((void *)map_ptr, map_size, mem_type, 0, 0); + if (result != KSFT_PASS) + return KSFT_FAIL; + + return KSFT_PASS; +} + +static int check_child_hugetlb_memory_mapping(int mem_type, int mode, int mapping) +{ + char *ptr; + int result; + unsigned long map_size; + + map_size = default_huge_page_size(); + + mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG); + ptr = (char *)mte_allocate_memory_tag_range(map_size, mem_type, mapping, + 0, 0); + if (check_allocated_memory_range(ptr, map_size, mem_type, + 0, 0) != KSFT_PASS) + return KSFT_FAIL; + result = check_child_tag_inheritance(ptr, map_size, mode); + mte_free_memory_tag_range((void *)ptr, map_size, mem_type, 0, 0); + if (result == KSFT_FAIL) + return result; + + return KSFT_PASS; +} + +int main(int argc, char *argv[]) +{ + int err; + + err = mte_default_setup(); + if (err) + return err; + + /* Register signal handlers */ + mte_register_signal(SIGBUS, mte_default_handler); + mte_register_signal(SIGSEGV, mte_default_handler); + + allocate_hugetlb(); + + if (!is_hugetlb_allocated()) { + ksft_print_msg("ERR: Unable allocate hugetlb pages\n"); + return KSFT_FAIL; + } + + /* Set test plan */ + ksft_set_plan(12); + + mte_enable_pstate_tco(); + + evaluate_test(check_hugetlb_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_OFF), + "Check hugetlb memory with private mapping, sync error mode, 
mmap memory and tag check off\n");
+
+ mte_disable_pstate_tco();
+ evaluate_test(check_hugetlb_memory_mapping(USE_MMAP, MTE_NONE_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_OFF),
+ "Check hugetlb memory with private mapping, no error mode, mmap memory and tag check off\n");
+
+ evaluate_test(check_hugetlb_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_ON),
+ "Check hugetlb memory with private mapping, sync error mode, mmap memory and tag check on\n");
+ evaluate_test(check_hugetlb_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_ON),
+ "Check hugetlb memory with private mapping, sync error mode, mmap/mprotect memory and tag check on\n");
+ evaluate_test(check_hugetlb_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_ON),
+ "Check hugetlb memory with private mapping, async error mode, mmap memory and tag check on\n");
+ evaluate_test(check_hugetlb_memory_mapping(USE_MPROTECT, MTE_ASYNC_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_ON),
+ "Check hugetlb memory with private mapping, async error mode, mmap/mprotect memory and tag check on\n");
+
+ evaluate_test(check_clear_prot_mte_flag(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+ "Check clear PROT_MTE flags with private mapping, sync error mode and mmap memory\n");
+ evaluate_test(check_clear_prot_mte_flag(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+ "Check clear PROT_MTE flags with private mapping and sync error mode and mmap/mprotect memory\n");
+
+ evaluate_test(check_child_hugetlb_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+ "Check child hugetlb memory with private mapping, precise mode and mmap memory\n");
+ evaluate_test(check_child_hugetlb_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+ "Check child hugetlb memory with private mapping, precise mode and mmap memory\n");
+ evaluate_test(check_child_hugetlb_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+ "Check child hugetlb memory with private mapping, precise mode and mmap/mprotect memory\n");
+ evaluate_test(check_child_hugetlb_memory_mapping(USE_MPROTECT, MTE_ASYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+ "Check child hugetlb memory with private mapping, precise mode and mmap/mprotect memory\n");
+
+ mte_restore_setup();
+ free_hugetlb();
+ ksft_print_cnts();
+ return ksft_get_fail_cnt() == 0 ? KSFT_PASS : KSFT_FAIL;
+}
From 48f8d9cef766f8ed4bbccc0d759710262d34f40b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sat, 5 Oct 2024 01:17:18 +0100 Subject: [PATCH 056/168] kselftest/arm64: Validate that GCS push and write permissions work Add trivial assembly programs which give themselves the appropriate permissions and then execute GCSPUSHM and GCSSTR; they will report errors by generating signals on the non-permitted instructions. Not using libc minimises the interaction with any policy set for the system, but we skip if we fail to get the permissions in case the system has been locked down to make them inaccessible. 
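For reference, the enable sequence the assembly performs corresponds roughly to the following C (an illustrative sketch only, not code from this patch; the tests deliberately avoid libc, and the constants are the ones defined in the test sources):

| #include <sys/prctl.h>
|
| #define PR_SET_SHADOW_STACK_STATUS	75
| #define PR_SHADOW_STACK_ENABLE		(1UL << 0)
| #define PR_SHADOW_STACK_PUSH		(1UL << 2)
|
| /* Ask for GCS with push permission; treat failure as a skip */
| if (prctl(PR_SET_SHADOW_STACK_STATUS,
|	    PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_PUSH, 0, 0, 0))
|	return KSFT_SKIP;

The gcsstr test does the same but requests PR_SHADOW_STACK_WRITE so that GCSSTR is permitted.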
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241005-arm64-gcs-test-flags-v1-1-03cb9786c5cd@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/gcs/.gitignore | 2 + tools/testing/selftests/arm64/gcs/Makefile | 8 +- tools/testing/selftests/arm64/gcs/gcspushm.S | 96 +++++++++++++++++++ tools/testing/selftests/arm64/gcs/gcsstr.S | 99 ++++++++++++++++++++ 4 files changed, 204 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/arm64/gcs/gcspushm.S create mode 100644 tools/testing/selftests/arm64/gcs/gcsstr.S diff --git a/tools/testing/selftests/arm64/gcs/.gitignore b/tools/testing/selftests/arm64/gcs/.gitignore index 1e8d1f6b27f2..bbb8e40a7e52 100644 --- a/tools/testing/selftests/arm64/gcs/.gitignore +++ b/tools/testing/selftests/arm64/gcs/.gitignore @@ -3,3 +3,5 @@ libc-gcs gcs-locking gcs-stress gcs-stress-thread +gcspushm +gcsstr diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile index d8b06ca51e22..d2f3497a9103 100644 --- a/tools/testing/selftests/arm64/gcs/Makefile +++ b/tools/testing/selftests/arm64/gcs/Makefile @@ -6,7 +6,7 @@ # nolibc. # -TEST_GEN_PROGS := basic-gcs libc-gcs gcs-locking gcs-stress +TEST_GEN_PROGS := basic-gcs libc-gcs gcs-locking gcs-stress gcspushm gcsstr TEST_GEN_PROGS_EXTENDED := gcs-stress-thread LDLIBS+=-lpthread @@ -22,3 +22,9 @@ $(OUTPUT)/basic-gcs: basic-gcs.c $(OUTPUT)/gcs-stress-thread: gcs-stress-thread.S $(CC) -nostdlib $^ -o $@ + +$(OUTPUT)/gcspushm: gcspushm.S + $(CC) -nostdlib $^ -o $@ + +$(OUTPUT)/gcsstr: gcsstr.S + $(CC) -nostdlib $^ -o $@ diff --git a/tools/testing/selftests/arm64/gcs/gcspushm.S b/tools/testing/selftests/arm64/gcs/gcspushm.S new file mode 100644 index 000000000000..bbe17c1325ac --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/gcspushm.S @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0-only +// +// Copyright 2024 Arm Limited +// +// Give ourselves GCS push permissions then use them + +#include + +/* Shadow Stack/Guarded Control Stack interface */ +#define PR_GET_SHADOW_STACK_STATUS 74 +#define PR_SET_SHADOW_STACK_STATUS 75 +#define PR_LOCK_SHADOW_STACK_STATUS 76 + +# define PR_SHADOW_STACK_ENABLE (1UL << 0) +# define PR_SHADOW_STACK_WRITE (1UL << 1) +# define PR_SHADOW_STACK_PUSH (1UL << 2) + +#define KSFT_SKIP 4 + +.macro function name + .macro endfunction + .type \name, @function + .purgem endfunction + .endm +\name: +.endm + +// Print a single character x0 to stdout +// Clobbers x0-x2,x8 +function putc + str x0, [sp, #-16]! 
+ + mov x0, #1 // STDOUT_FILENO + mov x1, sp + mov x2, #1 + mov x8, #__NR_write + svc #0 + + add sp, sp, #16 + ret +endfunction +.globl putc + +// Print a NUL-terminated string starting at address x0 to stdout +// Clobbers x0-x3,x8 +function puts + mov x1, x0 + + mov x2, #0 +0: ldrb w3, [x0], #1 + cbz w3, 1f + add x2, x2, #1 + b 0b + +1: mov w0, #1 // STDOUT_FILENO + mov x8, #__NR_write + svc #0 + + ret +endfunction +.globl puts + +// Utility macro to print a literal string +// Clobbers x0-x4,x8 +.macro puts string + .pushsection .rodata.str1.1, "aMS", @progbits, 1 +.L__puts_literal\@: .string "\string" + .popsection + + ldr x0, =.L__puts_literal\@ + bl puts +.endm + +.globl _start +function _start + // Run with GCS + mov x0, PR_SET_SHADOW_STACK_STATUS + mov x1, PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_PUSH + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x8, #__NR_prctl + svc #0 + cbz x0, 1f + puts "Failed to enable GCS with push permission\n" + mov x0, #KSFT_SKIP + b 2f +1: + sys #3, c7, c7, #0, x0 // GCSPUSHM + sysl x0, #3, c7, c7, #1 // GCSPOPM + + mov x0, #0 +2: + mov x8, #__NR_exit + svc #0 diff --git a/tools/testing/selftests/arm64/gcs/gcsstr.S b/tools/testing/selftests/arm64/gcs/gcsstr.S new file mode 100644 index 000000000000..a42bba6e30b1 --- /dev/null +++ b/tools/testing/selftests/arm64/gcs/gcsstr.S @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0-only +// +// Copyright 2024 Arm Limited +// +// Give ourselves GCS write permissions then use them + +#include + +/* Shadow Stack/Guarded Control Stack interface */ +#define PR_GET_SHADOW_STACK_STATUS 74 +#define PR_SET_SHADOW_STACK_STATUS 75 +#define PR_LOCK_SHADOW_STACK_STATUS 76 + +# define PR_SHADOW_STACK_ENABLE (1UL << 0) +# define PR_SHADOW_STACK_WRITE (1UL << 1) +# define PR_SHADOW_STACK_PUSH (1UL << 2) + +#define GCSPR_EL0 S3_3_C2_C5_1 + +#define KSFT_SKIP 4 + +.macro function name + .macro endfunction + .type \name, @function + .purgem endfunction + .endm +\name: +.endm + +// Print a single character x0 to stdout +// Clobbers x0-x2,x8 +function putc + str x0, [sp, #-16]! + + mov x0, #1 // STDOUT_FILENO + mov x1, sp + mov x2, #1 + mov x8, #__NR_write + svc #0 + + add sp, sp, #16 + ret +endfunction +.globl putc + +// Print a NUL-terminated string starting at address x0 to stdout +// Clobbers x0-x3,x8 +function puts + mov x1, x0 + + mov x2, #0 +0: ldrb w3, [x0], #1 + cbz w3, 1f + add x2, x2, #1 + b 0b + +1: mov w0, #1 // STDOUT_FILENO + mov x8, #__NR_write + svc #0 + + ret +endfunction +.globl puts + +// Utility macro to print a literal string +// Clobbers x0-x4,x8 +.macro puts string + .pushsection .rodata.str1.1, "aMS", @progbits, 1 +.L__puts_literal\@: .string "\string" + .popsection + + ldr x0, =.L__puts_literal\@ + bl puts +.endm + +.globl _start +function _start + // Run with GCS + mov x0, PR_SET_SHADOW_STACK_STATUS + mov x1, PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x8, #__NR_prctl + svc #0 + cbz x0, 1f + puts "Failed to enable GCS with write permission\n" + mov x0, #KSFT_SKIP + b 2f +1: + mrs x0, GCSPR_EL0 + sub x0, x0, #8 + .inst 0xd91f1c01 // GCSSTR x1, x0 + + mov x0, #0 +2: + mov x8, #__NR_exit + svc #0 From 9c4a25140dee5356254ddb921086c49fb93ba952 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Tue, 8 Oct 2024 15:01:21 +0100 Subject: [PATCH 057/168] arm64: cpufeature: add POE to cpucap_is_possible() Since de66cb37ab6 ("arm64: Add cpucap_is_possible()"), alternative_has_cap_unlikely() includes the IS_ENABLED() check. 
Add CONFIG_ARM64_POE to cpucap_is_possible() to avoid the explicit check. Signed-off-by: Joey Gouly Cc: Will Deacon Acked-by: Will Deacon Link: https://lore.kernel.org/r/20241008140121.2774348-1-joey.gouly@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cpucaps.h | 2 ++ arch/arm64/include/asm/cpufeature.h | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index a6e5b07b64fd..a08a1212ffbb 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -42,6 +42,8 @@ cpucap_is_possible(const unsigned int cap) return IS_ENABLED(CONFIG_ARM64_BTI); case ARM64_HAS_TLB_RANGE: return IS_ENABLED(CONFIG_ARM64_TLB_RANGE); + case ARM64_HAS_S1POE: + return IS_ENABLED(CONFIG_ARM64_POE); case ARM64_UNMAP_KERNEL_AT_EL0: return IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0); case ARM64_WORKAROUND_843419: diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 3d261cc123c1..4b6bc0bac9b9 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -834,8 +834,7 @@ static inline bool system_supports_lpa2(void) static inline bool system_supports_poe(void) { - return IS_ENABLED(CONFIG_ARM64_POE) && - alternative_has_cap_unlikely(ARM64_HAS_S1POE); + return alternative_has_cap_unlikely(ARM64_HAS_S1POE); } int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); From 9b9be78258511e67767e4aa51f587cf22feb5065 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 11 Oct 2024 15:36:25 +0100 Subject: [PATCH 058/168] kselftest/arm64: Ensure stable names for GCS stress test results The GCS stress test program currently uses the PID of the threads it creates in the test names it reports, resulting in unstable test names between runs. Fix this by using a thread number instead. 
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241011-arm64-gcs-stress-stable-name-v1-1-4950f226218e@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/gcs/gcs-stress.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress.c b/tools/testing/selftests/arm64/gcs/gcs-stress.c index bdec7ee8cfd5..03222c36c436 100644 --- a/tools/testing/selftests/arm64/gcs/gcs-stress.c +++ b/tools/testing/selftests/arm64/gcs/gcs-stress.c @@ -56,7 +56,7 @@ static int num_processors(void) return nproc; }
-static void start_thread(struct child_data *child)
+static void start_thread(struct child_data *child, int id)
 { int ret, pipefd[2], i; struct epoll_event ev;
@@ -132,7 +132,7 @@ static void start_thread(struct child_data *child) ev.events = EPOLLIN | EPOLLHUP; ev.data.ptr = child;
- ret = asprintf(&child->name, "Thread-%d", child->pid);
+ ret = asprintf(&child->name, "Thread-%d", id);
 if (ret == -1) ksft_exit_fail_msg("asprintf() failed\n");
@@ -437,7 +437,7 @@ int main(int argc, char **argv) tests); for (i = 0; i < gcs_threads; i++)
- start_thread(&children[i]);
+ start_thread(&children[i], i);
 /* * All children started, close the startup pipe and let them
From 0349934618907a0bfc9088282c526c2852d4ccaa Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 11 Oct 2024 11:30:25 +0100 Subject: [PATCH 059/168] arm64/sysreg: Update ID_AA64MMFR1_EL1 to DDI0601 2024-09 ID_AA64MMFR1_EL1 has been updated by the architecture to enumerate several new architectural features since the last time sysreg was updated; sync with the definition in DDI0601 2024-09 to include two new versions of each of ETS and HAFDBS. Reported-by: Marc Zyngier Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241011-arm64-aa64mmfr1-2024-09-v1-1-61935a085010@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/tools/sysreg | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 8d637ac4b7c6..ae64ba810298 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1648,6 +1648,8 @@ EndEnum UnsignedEnum 39:36 ETS 0b0000 NI 0b0001 IMP
+ 0b0010 ETS2
+ 0b0011 ETS3
 EndEnum UnsignedEnum 35:32 TWED 0b0000 NI @@ -1688,6 +1690,8 @@ UnsignedEnum 3:0 HAFDBS 0b0000 NI 0b0001 AF 0b0010 DBM
+ 0b0011 HAFT
+ 0b0100 HDBSS
 EndEnum EndSysreg
From 0f612c6eb13ad85a60e00c751c83e24f4a32cd0a Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 14 Oct 2024 13:03:41 +1000 Subject: [PATCH 060/168] arm64: head: Drop SWAPPER_TABLE_SHIFT There are no users of SWAPPER_TABLE_SHIFT after commit 84b04d3e6bdb ("arm64: kernel: Create initial ID map from C code"). Just drop it. 
Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20241014030341.995806-1-gshan@redhat.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/kernel-pgtable.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index bf05a77873a4..fd5a08450b12 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -26,7 +26,6 @@ #define SWAPPER_SKIP_LEVEL 0 #endif #define SWAPPER_BLOCK_SIZE (UL(1) << SWAPPER_BLOCK_SHIFT) -#define SWAPPER_TABLE_SHIFT (SWAPPER_BLOCK_SHIFT + PAGE_SHIFT - 3) #define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS - SWAPPER_SKIP_LEVEL) #define INIT_IDMAP_PGTABLE_LEVELS (IDMAP_LEVELS - SWAPPER_SKIP_LEVEL) From c56c599d9002d44f559be3852b371db46adac87c Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Mon, 30 Sep 2024 17:10:47 +0100 Subject: [PATCH 061/168] arm64: probes: Disable kprobes/uprobes on MOPS instructions FEAT_MOPS instructions require that all three instructions (prologue, main and epilogue) appear consecutively in memory. Placing a kprobe/uprobe on one of them doesn't work as only a single instruction gets executed out-of-line or simulated. So don't allow placing a probe on a MOPS instruction. Fixes: b7564127ffcb ("arm64: mops: detect and enable FEAT_MOPS") Signed-off-by: Kristina Martsenko Link: https://lore.kernel.org/r/20240930161051.3777828-2-kristina.martsenko@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/insn.h | 1 + arch/arm64/kernel/probes/decode-insn.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 8c0a36f72d6f..bc77869dbd43 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -353,6 +353,7 @@ __AARCH64_INSN_FUNCS(ldrsw_lit, 0xFF000000, 0x98000000) __AARCH64_INSN_FUNCS(exclusive, 0x3F800000, 0x08000000) __AARCH64_INSN_FUNCS(load_ex, 0x3F400000, 0x08400000) __AARCH64_INSN_FUNCS(store_ex, 0x3F400000, 0x08000000) +__AARCH64_INSN_FUNCS(mops, 0x3B200C00, 0x19000400) __AARCH64_INSN_FUNCS(stp, 0x7FC00000, 0x29000000) __AARCH64_INSN_FUNCS(ldp, 0x7FC00000, 0x29400000) __AARCH64_INSN_FUNCS(stp_post, 0x7FC00000, 0x28800000) diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 968d5fffe233..77f3c8eb0916 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -58,10 +58,13 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) * Instructions which load PC relative literals are not going to work * when executed from an XOL slot. Instructions doing an exclusive * load/store are not going to complete successfully when single-step - * exception handling happens in the middle of the sequence. + * exception handling happens in the middle of the sequence. Memory + * copy/set instructions require that all three instructions be placed + * consecutively in memory. 
*/ if (aarch64_insn_uses_literal(insn) || - aarch64_insn_is_exclusive(insn)) + aarch64_insn_is_exclusive(insn) || + aarch64_insn_is_mops(insn)) return false; return true; From 13840229d6bd5c191a9ca68ceba0af0fa03d7645 Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Mon, 30 Sep 2024 17:10:48 +0100 Subject: [PATCH 062/168] arm64: mops: Handle MOPS exceptions from EL1 We will soon be using MOPS instructions in the kernel, so wire up the exception handler to handle exceptions from EL1 caused by the copy/set operation being stopped and resumed on a different type of CPU. Add a helper for advancing the single step state machine, similarly to what the EL0 exception handler does. Signed-off-by: Kristina Martsenko Link: https://lore.kernel.org/r/20240930161051.3777828-3-kristina.martsenko@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/debug-monitors.h | 1 + arch/arm64/include/asm/exception.h | 1 + arch/arm64/kernel/debug-monitors.c | 5 +++++ arch/arm64/kernel/entry-common.c | 12 ++++++++++++ arch/arm64/kernel/traps.c | 7 +++++++ 5 files changed, 26 insertions(+) diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index 13d437bcbf58..8f6ba31b8658 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -105,6 +105,7 @@ void kernel_enable_single_step(struct pt_regs *regs); void kernel_disable_single_step(void); int kernel_active_single_step(void); void kernel_rewind_single_step(struct pt_regs *regs); +void kernel_fastforward_single_step(struct pt_regs *regs); #ifdef CONFIG_HAVE_HW_BREAKPOINT int reinstall_suspended_bps(struct pt_regs *regs); diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index f296662590c7..8689b95f6b53 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -73,6 +73,7 @@ void do_el0_svc_compat(struct pt_regs *regs); void do_el0_fpac(struct pt_regs *regs, unsigned long esr); void do_el1_fpac(struct pt_regs *regs, unsigned long esr); void do_el0_mops(struct pt_regs *regs, unsigned long esr); +void do_el1_mops(struct pt_regs *regs, unsigned long esr); void do_serror(struct pt_regs *regs, unsigned long esr); void do_signal(struct pt_regs *regs); diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 024a7b245056..c60a4a90c6a5 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -441,6 +441,11 @@ void kernel_rewind_single_step(struct pt_regs *regs) set_regs_spsr_ss(regs); } +void kernel_fastforward_single_step(struct pt_regs *regs) +{ + clear_regs_spsr_ss(regs); +} + /* ptrace API */ void user_enable_single_step(struct task_struct *task) { diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 3fcd9d080bf2..9d174cd541ef 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -463,6 +463,15 @@ static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr) exit_to_kernel_mode(regs); } +static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr) +{ + enter_from_kernel_mode(regs); + local_daif_inherit(regs); + do_el1_mops(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs); +} + static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); @@ -505,6 +514,9 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_BTI: el1_bti(regs, esr); break; + case ESR_ELx_EC_MOPS: 
+ el1_mops(regs, esr); + break; case ESR_ELx_EC_BREAKPT_CUR: case ESR_ELx_EC_SOFTSTP_CUR: case ESR_ELx_EC_WATCHPT_CUR: diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 563cbce11126..fc6d44e06b8d 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -531,6 +531,13 @@ void do_el0_mops(struct pt_regs *regs, unsigned long esr) user_fastforward_single_step(current); } +void do_el1_mops(struct pt_regs *regs, unsigned long esr) +{ + arm64_mops_reset_regs(®s->user_regs, esr); + + kernel_fastforward_single_step(regs); +} + #define __user_cache_maint(insn, address, res) \ if (address >= TASK_SIZE_MAX) { \ res = -EFAULT; \ From b616058c6613e1fd3e2bc8d4c05b558c8854aab3 Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Mon, 30 Sep 2024 17:10:49 +0100 Subject: [PATCH 063/168] arm64: mops: Document booting requirement for HCR_EL2.MCE2 Document that hypervisors must set HCR_EL2.MCE2 and handle MOPS exceptions when they migrate a vCPU to another type of CPU, as Linux may not be able to handle the exception at all times. As one example, when running under nested virtualization, KVM does not handle MOPS exceptions from the nVHE/hVHE EL2 hyp as the hyp is never migrated, so the host hypervisor needs to handle them. There may be other situations (now or in the future) where the kernel can't handle an unexpected MOPS exception, so require that the hypervisor handles them. Signed-off-by: Kristina Martsenko Link: https://lore.kernel.org/r/20240930161051.3777828-4-kristina.martsenko@arm.com Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/booting.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/arch/arm64/booting.rst b/Documentation/arch/arm64/booting.rst index b57776a68f15..db46af5b9f0f 100644 --- a/Documentation/arch/arm64/booting.rst +++ b/Documentation/arch/arm64/booting.rst @@ -385,6 +385,9 @@ Before jumping into the kernel, the following conditions must be met: - HCRX_EL2.MSCEn (bit 11) must be initialised to 0b1. + - HCRX_EL2.MCE2 (bit 10) must be initialised to 0b1. The exception + handler must set PSTATE.SS to 0b0. + For CPUs with the Extended Translation Control Register feature (FEAT_TCR2): - If EL3 is present: From 836ed3c4e473fef9e0814a2ba6dd40f9656c03f1 Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Mon, 30 Sep 2024 17:10:50 +0100 Subject: [PATCH 064/168] arm64: lib: Use MOPS for memcpy() routines Make memcpy(), memmove() and memset() use the Armv8.8 FEAT_MOPS instructions when implemented on the CPU. The CPY*/SET* instructions copy or set a block of memory of arbitrary size and alignment. They can be interrupted by the CPU and the copying resumed later. Their performance is expected to be close to the best generic copy/set sequence of loads/stores for a given CPU. Using them in the kernel's copy/set routines therefore avoids the need to periodically rewrite the routines to optimize for new microarchitectures. It could also lead to a performance improvement for some CPUs and systems. With this change the kernel will always use the instructions if they are implemented on the CPU (and have not been disabled by the arm64.nomops command line parameter). When not implemented the usual routines will be used (patched via alternatives). Note, we need to patch B/NOP instead of the whole sequence to avoid executing a partially patched sequence in case the compiler generates a mem*() call inside the alternatives patching code. 
Note that MOPS instructions have relaxed behavior on Device memory, but it is expected that these routines are not generally used on MMIO. Note: For memcpy(), this uses the CPY* instructions instead of CPYF*, as CPY* allows overlaps between the source and destination buffers, and despite contradicting the C standard, compilers require that memcpy() work on exactly overlapping source and destination: https://gcc.gnu.org/onlinedocs/gcc/Standards.html#C-Language https://reviews.llvm.org/D86993 Signed-off-by: Kristina Martsenko Link: https://lore.kernel.org/r/20240930161051.3777828-5-kristina.martsenko@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 3 +++ arch/arm64/lib/memcpy.S | 19 ++++++++++++++++++- arch/arm64/lib/memset.S | 20 +++++++++++++++++++- 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3e29b44d2d7b..d0fe90ea704d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2155,6 +2155,9 @@ config ARM64_EPAN if the cpu does not implement the feature. endmenu # "ARMv8.7 architectural features" +config AS_HAS_MOPS + def_bool $(as-instr,.arch_extension mops) + menu "ARMv8.9 architectural features" config ARM64_POE diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S index 4ab48d49c451..9b99106fb95f 100644 --- a/arch/arm64/lib/memcpy.S +++ b/arch/arm64/lib/memcpy.S @@ -57,7 +57,7 @@ The loop tail is handled by always copying 64 bytes from the end. */ -SYM_FUNC_START(__pi_memcpy) +SYM_FUNC_START_LOCAL(__pi_memcpy_generic) add srcend, src, count add dstend, dstin, count cmp count, 128 @@ -238,7 +238,24 @@ L(copy64_from_start): stp B_l, B_h, [dstin, 16] stp C_l, C_h, [dstin] ret +SYM_FUNC_END(__pi_memcpy_generic) + +#ifdef CONFIG_AS_HAS_MOPS + .arch_extension mops +SYM_FUNC_START(__pi_memcpy) +alternative_if_not ARM64_HAS_MOPS + b __pi_memcpy_generic +alternative_else_nop_endif + + mov dst, dstin + cpyp [dst]!, [src]!, count! + cpym [dst]!, [src]!, count! + cpye [dst]!, [src]!, count! + ret SYM_FUNC_END(__pi_memcpy) +#else +SYM_FUNC_ALIAS(__pi_memcpy, __pi_memcpy_generic) +#endif SYM_FUNC_ALIAS(__memcpy, __pi_memcpy) EXPORT_SYMBOL(__memcpy) diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S index a5aebe82ad73..97157da65ec6 100644 --- a/arch/arm64/lib/memset.S +++ b/arch/arm64/lib/memset.S @@ -26,6 +26,7 @@ */ dstin .req x0 +val_x .req x1 val .req w1 count .req x2 tmp1 .req x3 @@ -42,7 +43,7 @@ dst .req x8 tmp3w .req w9 tmp3 .req x9 -SYM_FUNC_START(__pi_memset) +SYM_FUNC_START_LOCAL(__pi_memset_generic) mov dst, dstin /* Preserve return value. 
*/ and A_lw, val, #255 orr A_lw, A_lw, A_lw, lsl #8 @@ -201,7 +202,24 @@ SYM_FUNC_START(__pi_memset) ands count, count, zva_bits_x b.ne .Ltail_maybe_long ret +SYM_FUNC_END(__pi_memset_generic) + +#ifdef CONFIG_AS_HAS_MOPS + .arch_extension mops +SYM_FUNC_START(__pi_memset) +alternative_if_not ARM64_HAS_MOPS + b __pi_memset_generic +alternative_else_nop_endif + + mov dst, dstin + setp [dst]!, count!, val_x + setm [dst]!, count!, val_x + sete [dst]!, count!, val_x + ret SYM_FUNC_END(__pi_memset) +#else +SYM_FUNC_ALIAS(__pi_memset, __pi_memset_generic) +#endif SYM_FUNC_ALIAS(__memset, __pi_memset) EXPORT_SYMBOL(__memset) From ce6b5ff5f16dd9267d62d09b3af3f0c7dc3c24f0 Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Mon, 30 Sep 2024 17:10:51 +0100 Subject: [PATCH 065/168] arm64: lib: Use MOPS for copy_page() and clear_page() Similarly to what was done to the memcpy() routines, make copy_page() and clear_page() also use the Armv8.8 FEAT_MOPS instructions. Note: For copy_page() this uses the CPY* instructions instead of CPYF* as CPYF* doesn't allow src and dst to be equal. It's not clear if copy_page() needs to allow equal src and dst but it has worked so far with the current implementation and there is no documentation forbidding it. Note, the unoptimized version of copy_page() in assembler.h is left as it is. Signed-off-by: Kristina Martsenko Link: https://lore.kernel.org/r/20240930161051.3777828-6-kristina.martsenko@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/lib/clear_page.S | 13 +++++++++++++ arch/arm64/lib/copy_page.S | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/arch/arm64/lib/clear_page.S b/arch/arm64/lib/clear_page.S index ebde40e7fa2b..bd6f7d5eb6eb 100644 --- a/arch/arm64/lib/clear_page.S +++ b/arch/arm64/lib/clear_page.S @@ -15,6 +15,19 @@ * x0 - dest */ SYM_FUNC_START(__pi_clear_page) +#ifdef CONFIG_AS_HAS_MOPS + .arch_extension mops +alternative_if_not ARM64_HAS_MOPS + b .Lno_mops +alternative_else_nop_endif + + mov x1, #PAGE_SIZE + setpn [x0]!, x1!, xzr + setmn [x0]!, x1!, xzr + seten [x0]!, x1!, xzr + ret +.Lno_mops: +#endif mrs x1, dczid_el0 tbnz x1, #4, 2f /* Branch if DC ZVA is prohibited */ and w1, w1, #0xf diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S index 6a56d7cf309d..e6374e7e5511 100644 --- a/arch/arm64/lib/copy_page.S +++ b/arch/arm64/lib/copy_page.S @@ -18,6 +18,19 @@ * x1 - src */ SYM_FUNC_START(__pi_copy_page) +#ifdef CONFIG_AS_HAS_MOPS + .arch_extension mops +alternative_if_not ARM64_HAS_MOPS + b .Lno_mops +alternative_else_nop_endif + + mov x2, #PAGE_SIZE + cpypwn [x0]!, [x1]!, x2! + cpymwn [x0]!, [x1]!, x2! + cpyewn [x0]!, [x1]!, x2! + ret +.Lno_mops: +#endif ldp x2, x3, [x1] ldp x4, x5, [x1, #16] ldp x6, x7, [x1, #32] From c87df9cb9a216bc87951087a2592633cdc2737d4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:29 +0100 Subject: [PATCH 066/168] arm64: pt_regs: assert pt_regs is a multiple of 16 bytes To ensure that the stack is correctly aligned when branching to C code, we require that struct pt_regs is a multiple of 16 bytes, as noted in a comment. Add an explicit assertion for this, so that any accidental violation of this requirement will be caught by the compiler. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. 
Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-2-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/ptrace.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 0abe975d68a8..10eca3a3fed4 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -149,8 +149,7 @@ static inline unsigned long pstate_to_compat_psr(const unsigned long pstate) /* * This struct defines the way the registers are stored on the stack during an
- * exception. Note that sizeof(struct pt_regs) has to be a multiple of 16 (for
- * stack alignment). struct user_pt_regs must form a prefix of struct pt_regs.
+ * exception. struct user_pt_regs must form a prefix of struct pt_regs.
 */ struct pt_regs { union { @@ -180,6 +179,9 @@ struct pt_regs { u64 exit_rcu; };
+/* For correct stack alignment, pt_regs has to be a multiple of 16 bytes. */
+static_assert(IS_ALIGNED(sizeof(struct pt_regs), 16));
+
 static inline bool in_syscall(struct pt_regs const *regs) { return regs->syscallno != NO_SYSCALL;
From 2716d59bf48328ea1bc2a90d81488f715d4cdea4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:30 +0100 Subject: [PATCH 067/168] arm64: pt_regs: remove stale big-endian layout For historical reasons the layout of struct pt_regs depends on the configured endianness, with the order of the 'syscallno' and 'unused2' fields varying depending on whether __AARCH64EB__ is defined. We no longer depend on the order of these two fields and can remove the ifdeffery. The current conditional layout was introduced in commit: 35d0e6fb4d219d64 ("arm64: syscallno is secretly an int, make it official") At the time, this was necessary so that the entry assembly could use a single STP instruction to save the pt_regs::{orig_x0,syscallno} fields, without logic that was conditional on the endianness of the kernel: | el0_svc_naked: | stp x0, xscno, [sp, #S_ORIG_X0] // save the original x0 and syscall number This logic was converted to C in commit: f37099b6992a0b81 ("arm64: convert syscall trace logic to C") Since that commit, we no longer manipulate pt_regs::orig_x0 from assembly, and only manipulate pt_regs::syscallno as a 32-bit quantity early in the kernel_entry assembly: | /* Not in a syscall by default (el0_svc overwrites for real syscall) */ | .if \el == 0 | mov w21, #NO_SYSCALL | str w21, [sp, #S_SYSCALLNO] | .endif Given the above, there's no longer a need for the layout of pt_regs::{syscallno,unused2} to depend on the endianness of the kernel. This patch removes the ifdeffery and places 'syscallno' before 'unused2' regardless of the endianness of the kernel. At the same time, 'unused2' is renamed to 'unused', as it is the only unused field within pt_regs. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. 
Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-3-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/ptrace.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 10eca3a3fed4..1ae671fe64d8 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -162,13 +162,9 @@ struct pt_regs { }; }; u64 orig_x0; -#ifdef __AARCH64EB__ - u32 unused2; s32 syscallno; -#else - s32 syscallno; - u32 unused2; -#endif + u32 unused; + u64 sdei_ttbr1; /* Only valid when ARM64_HAS_GIC_PRIO_MASKING is enabled. */ u64 pmr_save; From 00d9597903d0053bb13c74dfe61bcba35e213bd7 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:31 +0100 Subject: [PATCH 068/168] arm64: pt_regs: rename "pmr_save" -> "pmr" The pt_regs::pmr_save field is weirdly named relative to all other pt_regs fields, with a '_save' suffix that doesn't make anything clearer and only leads to more typing to access the field. Remove the '_save' suffix. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-4-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/daifflags.h | 2 +- arch/arm64/include/asm/processor.h | 2 +- arch/arm64/include/asm/ptrace.h | 4 ++-- arch/arm64/kernel/asm-offsets.c | 2 +- arch/arm64/kernel/entry.S | 4 ++-- arch/arm64/kernel/process.c | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/daifflags.h b/arch/arm64/include/asm/daifflags.h index 55f57dfa8e2f..fbb5c99eb2f9 100644 --- a/arch/arm64/include/asm/daifflags.h +++ b/arch/arm64/include/asm/daifflags.h @@ -132,7 +132,7 @@ static inline void local_daif_inherit(struct pt_regs *regs) trace_hardirqs_on(); if (system_uses_irq_prio_masking()) - gic_write_pmr(regs->pmr_save); + gic_write_pmr(regs->pmr); /* * We can't use local_daif_restore(regs->pstate) here as diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 1438424f0064..03b99164f7f4 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -293,7 +293,7 @@ static inline void start_thread_common(struct pt_regs *regs, unsigned long pc) regs->pc = pc; if (system_uses_irq_prio_masking()) - regs->pmr_save = GIC_PRIO_IRQON; + regs->pmr = GIC_PRIO_IRQON; } static inline void start_thread(struct pt_regs *regs, unsigned long pc, diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 1ae671fe64d8..e82e6e47ac9a 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -167,7 +167,7 @@ struct pt_regs { u64 sdei_ttbr1; /* Only valid when ARM64_HAS_GIC_PRIO_MASKING is enabled. */ - u64 pmr_save; + u64 pmr; u64 stackframe[2]; /* Only valid for some EL1 exceptions. */ @@ -211,7 +211,7 @@ static inline void forget_syscall(struct pt_regs *regs) #define irqs_priority_unmasked(regs) \ (system_uses_irq_prio_masking() ? 
\ - (regs)->pmr_save == GIC_PRIO_IRQON : \ + (regs)->pmr == GIC_PRIO_IRQON : \ true) #define interrupts_enabled(regs) \ diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 27de1dddb0ab..eb7fb2f9b927 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -79,7 +79,7 @@ int main(void) DEFINE(S_PSTATE, offsetof(struct pt_regs, pstate)); DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno)); DEFINE(S_SDEI_TTBR1, offsetof(struct pt_regs, sdei_ttbr1)); - DEFINE(S_PMR_SAVE, offsetof(struct pt_regs, pmr_save)); + DEFINE(S_PMR, offsetof(struct pt_regs, pmr)); DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); DEFINE(PT_REGS_SIZE, sizeof(struct pt_regs)); BLANK(); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 7ef0e127b149..a84ce95ad96c 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -315,7 +315,7 @@ alternative_if_not ARM64_HAS_GIC_PRIO_MASKING alternative_else_nop_endif mrs_s x20, SYS_ICC_PMR_EL1 - str x20, [sp, #S_PMR_SAVE] + str x20, [sp, #S_PMR] mov x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET msr_s SYS_ICC_PMR_EL1, x20 @@ -342,7 +342,7 @@ alternative_if_not ARM64_HAS_GIC_PRIO_MASKING b .Lskip_pmr_restore\@ alternative_else_nop_endif - ldr x20, [sp, #S_PMR_SAVE] + ldr x20, [sp, #S_PMR] msr_s SYS_ICC_PMR_EL1, x20 /* Ensure priority change is seen by redistributor */ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 0540653fbf38..2951b5ba1937 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -227,7 +227,7 @@ void __show_regs(struct pt_regs *regs) printk("sp : %016llx\n", sp); if (system_uses_irq_prio_masking()) - printk("pmr_save: %08llx\n", regs->pmr_save); + printk("pmr: %08llx\n", regs->pmr); i = top_reg; From 1454363098a0f7f14479f6a45945bf8d3d775a95 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:32 +0100 Subject: [PATCH 069/168] arm64: pt_regs: swap 'unused' and 'pmr' fields In subsequent patches we'll want to add an additional u64 to struct pt_regs. To make space, this patch swaps the 'unused' and 'pmr' fields, as the 'pmr' value only requires bits[7:0] and can safely fit into a u32, which frees up a 64-bit unused field. The 'lockdep_hardirqs' and 'exit_rcu' fields should eventually be moved out of pt_regs and managed locally within entry-common.c, so I've left those as-is for the moment. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-5-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/ptrace.h | 6 +++--- arch/arm64/kernel/entry.S | 4 ++-- arch/arm64/kernel/process.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index e82e6e47ac9a..92531aeba531 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -163,11 +163,11 @@ struct pt_regs { }; u64 orig_x0; s32 syscallno; - u32 unused; + u32 pmr; u64 sdei_ttbr1; - /* Only valid when ARM64_HAS_GIC_PRIO_MASKING is enabled. */ - u64 pmr; + u64 unused; + u64 stackframe[2]; /* Only valid for some EL1 exceptions. 
*/ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index a84ce95ad96c..fa6d6d5ca5e0 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -315,7 +315,7 @@ alternative_if_not ARM64_HAS_GIC_PRIO_MASKING alternative_else_nop_endif mrs_s x20, SYS_ICC_PMR_EL1 - str x20, [sp, #S_PMR] + str w20, [sp, #S_PMR] mov x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET msr_s SYS_ICC_PMR_EL1, x20 @@ -342,7 +342,7 @@ alternative_if_not ARM64_HAS_GIC_PRIO_MASKING b .Lskip_pmr_restore\@ alternative_else_nop_endif - ldr x20, [sp, #S_PMR] + ldr w20, [sp, #S_PMR] msr_s SYS_ICC_PMR_EL1, x20 /* Ensure priority change is seen by redistributor */ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 2951b5ba1937..c722c1be6fa5 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -227,7 +227,7 @@ void __show_regs(struct pt_regs *regs) printk("sp : %016llx\n", sp); if (system_uses_irq_prio_masking()) - printk("pmr: %08llx\n", regs->pmr); + printk("pmr: %08x\n", regs->pmr); i = top_reg; From 886c2b0ba820b9d6ffe3a7c670eb2f519755123c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:33 +0100 Subject: [PATCH 070/168] arm64: use a common struct frame_record Currently the signal handling code has its own struct frame_record, the definition of struct pt_regs open-codes a frame record as an array, and the kernel unwinder hard-codes frame record offsets. Move to a common struct frame_record that can be used throughout the kernel. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-6-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/ptrace.h | 4 +++- arch/arm64/include/asm/stacktrace/common.h | 8 +++++--- arch/arm64/include/asm/stacktrace/frame.h | 13 +++++++++++++ arch/arm64/kernel/process.c | 2 +- arch/arm64/kernel/signal.c | 5 ----- arch/arm64/kernel/stacktrace.c | 2 +- 6 files changed, 23 insertions(+), 11 deletions(-) create mode 100644 arch/arm64/include/asm/stacktrace/frame.h diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 92531aeba531..89c02f85f4b1 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -98,6 +98,8 @@ #include #include +#include + /* sizeof(struct user) for AArch32 */ #define COMPAT_USER_SZ 296 @@ -168,7 +170,7 @@ struct pt_regs { u64 sdei_ttbr1; u64 unused; - u64 stackframe[2]; + struct frame_record stackframe; /* Only valid for some EL1 exceptions. */ u64 lockdep_hardirqs; diff --git a/arch/arm64/include/asm/stacktrace/common.h b/arch/arm64/include/asm/stacktrace/common.h index f63dc654e545..7fab6876e497 100644 --- a/arch/arm64/include/asm/stacktrace/common.h +++ b/arch/arm64/include/asm/stacktrace/common.h @@ -137,21 +137,23 @@ found: static inline int unwind_next_frame_record(struct unwind_state *state) { + struct frame_record *record; unsigned long fp = state->fp; int err; if (fp & 0x7) return -EINVAL; - err = unwind_consume_stack(state, fp, 16); + err = unwind_consume_stack(state, fp, sizeof(*record)); if (err) return err; /* * Record this frame record's values. 
*/ - state->fp = READ_ONCE(*(unsigned long *)(fp)); - state->pc = READ_ONCE(*(unsigned long *)(fp + 8)); + record = (struct frame_record *)fp; + state->fp = READ_ONCE(record->fp); + state->pc = READ_ONCE(record->lr); return 0; } diff --git a/arch/arm64/include/asm/stacktrace/frame.h b/arch/arm64/include/asm/stacktrace/frame.h new file mode 100644 index 000000000000..6397bc847f14 --- /dev/null +++ b/arch/arm64/include/asm/stacktrace/frame.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_STACKTRACE_FRAME_H +#define __ASM_STACKTRACE_FRAME_H + +/* + * A standard AAPCS64 frame record. + */ +struct frame_record { + u64 fp; + u64 lr; +}; + +#endif /* __ASM_STACKTRACE_FRAME_H */ diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index c722c1be6fa5..d45fd114eac3 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -419,7 +419,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) * For the benefit of the unwinder, set up childregs->stackframe * as the final frame for the new task. */ - p->thread.cpu_context.fp = (unsigned long)childregs->stackframe; + p->thread.cpu_context.fp = (unsigned long)&childregs->stackframe; ptrace_hw_copy_thread(p); diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 561986947530..2c47f9a0e40b 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -42,11 +42,6 @@ struct rt_sigframe { struct ucontext uc; }; -struct frame_record { - u64 fp; - u64 lr; -}; - struct rt_sigframe_user_layout { struct rt_sigframe __user *sigframe; struct frame_record __user *next_frame; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 2729faaee4b4..ffe8e4f54956 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -145,7 +145,7 @@ kunwind_next(struct kunwind_state *state) int err; /* Final frame; nothing to unwind */ - if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) + if (fp == (unsigned long)&task_pt_regs(tsk)->stackframe) return -ENOENT; err = unwind_next_frame_record(&state->common); From b7794795c93d9c15e4bc64db2f3bf2104451e3bc Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:34 +0100 Subject: [PATCH 071/168] arm64: stacktrace: move dump_backtrace() to kunwind_stack_walk() Currently dump_backtrace() can only print the PC value at each step of the unwind, as this is all the information that arch_stack_walk() passes to the dump_backtrace_entry() callback. In future we'd like to print some additional information, such as the origin of entries (e.g. PC, LR, FP) and/or the reliability thereof. In preparation for doing so, this patch moves dump_backtrace() over to kunwind_stack_walk(), which passes the full kunwind_state to the callback. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. 
Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-7-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/stacktrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index ffe8e4f54956..22f8709f0e76 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -294,10 +294,10 @@ noinline noinstr void arch_bpf_stack_walk(bool (*consume_entry)(void *cookie, u6 kunwind_stack_walk(arch_bpf_unwind_consume_entry, &data, current, NULL); } -static bool dump_backtrace_entry(void *arg, unsigned long where) +static bool dump_backtrace_entry(const struct kunwind_state *state, void *arg) { char *loglvl = arg; - printk("%s %pSb\n", loglvl, (void *)where); + printk("%s %pSb\n", loglvl, (void *)state->common.pc); return true; } @@ -316,7 +316,7 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, return; printk("%sCall trace:\n", loglvl); - arch_stack_walk(dump_backtrace_entry, (void *)loglvl, tsk, regs); + kunwind_stack_walk(dump_backtrace_entry, (void *)loglvl, tsk, regs); put_task_stack(tsk); } From bdf8eafbf7f56f9fa43a019cdc1a5f057210f01d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:35 +0100 Subject: [PATCH 072/168] arm64: stacktrace: report source of unwind data When analysing a stacktrace it can be useful to know where an unwound PC came from, as in some situations certain sources may be suspect or known to be unreliable. In future it would also be useful to track this so that certain unwind steps can be performed in a stateful manner. For example when unwinding across an exception boundary, we'd ideally unwind pt_regs::pc, then pt_regs::lr, then the next frame record. This patch adds an enumerated set of unwind sources, tracks this during the unwind, and updates dump_backtrace() to log these for interesting unwind steps. The interesting sources recorded are: "C" - the PC came from the caller of an unwind function. "T" - the PC came from thread_saved_pc() for a blocked task. "P" - the PC came from a pt_regs::pc. "U" - the PC came from an unknown source (indicates an unwinder error). ... with nothing recorded when the PC came from a frame_record::pc as this is the vastly common case and logging this would make it difficult to spot the more interesting cases. For example, when triggering a backtrace via magic-sysrq + L, the CPU handling the sysrq will have a backtrace whose first element is the caller (C) of dump_backtrace(): | Call trace: | show_stack+0x18/0x30 (C) | dump_stack_lvl+0x60/0x80 | dump_stack+0x18/0x24 | nmi_cpu_backtrace+0xfc/0x140 | ... ... and other CPUs will have a backtrace whose first element is their pt_regs::pc (P) at the instant the backtrace IPI was taken: | Call trace: | _raw_spin_unlock_irqrestore+0x8/0x50 (P) | wake_up_process+0x18/0x24 | process_timeout+0x14/0x20 | call_timer_fn.isra.0+0x24/0x80 | ... Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. 
Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-8-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/stacktrace.c | 47 +++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 22f8709f0e76..83ab37e13159 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -20,6 +20,14 @@ #include #include +enum kunwind_source { + KUNWIND_SOURCE_UNKNOWN, + KUNWIND_SOURCE_FRAME, + KUNWIND_SOURCE_CALLER, + KUNWIND_SOURCE_TASK, + KUNWIND_SOURCE_REGS_PC, +}; + /* * Kernel unwind state * @@ -37,6 +45,7 @@ struct kunwind_state { #ifdef CONFIG_KRETPROBES struct llist_node *kr_cur; #endif + enum kunwind_source source; }; static __always_inline void @@ -45,6 +54,7 @@ kunwind_init(struct kunwind_state *state, { unwind_init_common(&state->common); state->task = task; + state->source = KUNWIND_SOURCE_UNKNOWN; } /* @@ -62,6 +72,7 @@ kunwind_init_from_regs(struct kunwind_state *state, state->common.fp = regs->regs[29]; state->common.pc = regs->pc; + state->source = KUNWIND_SOURCE_REGS_PC; } /* @@ -79,6 +90,7 @@ kunwind_init_from_caller(struct kunwind_state *state) state->common.fp = (unsigned long)__builtin_frame_address(1); state->common.pc = (unsigned long)__builtin_return_address(0); + state->source = KUNWIND_SOURCE_CALLER; } /* @@ -99,6 +111,7 @@ kunwind_init_from_task(struct kunwind_state *state, state->common.fp = thread_saved_fp(task); state->common.pc = thread_saved_pc(task); + state->source = KUNWIND_SOURCE_TASK; } static __always_inline int @@ -148,9 +161,19 @@ kunwind_next(struct kunwind_state *state) if (fp == (unsigned long)&task_pt_regs(tsk)->stackframe) return -ENOENT; - err = unwind_next_frame_record(&state->common); - if (err) - return err; + switch (state->source) { + case KUNWIND_SOURCE_FRAME: + case KUNWIND_SOURCE_CALLER: + case KUNWIND_SOURCE_TASK: + case KUNWIND_SOURCE_REGS_PC: + err = unwind_next_frame_record(&state->common); + if (err) + return err; + state->source = KUNWIND_SOURCE_FRAME; + break; + default: + return -EINVAL; + } state->common.pc = ptrauth_strip_kernel_insn_pac(state->common.pc); @@ -294,10 +317,26 @@ noinline noinstr void arch_bpf_stack_walk(bool (*consume_entry)(void *cookie, u6 kunwind_stack_walk(arch_bpf_unwind_consume_entry, &data, current, NULL); } +static const char *state_source_string(const struct kunwind_state *state) +{ + switch (state->source) { + case KUNWIND_SOURCE_FRAME: return NULL; + case KUNWIND_SOURCE_CALLER: return "C"; + case KUNWIND_SOURCE_TASK: return "T"; + case KUNWIND_SOURCE_REGS_PC: return "P"; + default: return "U"; + } +} + static bool dump_backtrace_entry(const struct kunwind_state *state, void *arg) { + const char *source = state_source_string(state); char *loglvl = arg; - printk("%s %pSb\n", loglvl, (void *)state->common.pc); + printk("%s %pSb%s%s%s\n", loglvl, + (void *)state->common.pc, + source ? " (" : "", + source ? source : "", + source ? ")" : ""); return true; } From 8094df1cf09248e60afd0e14fb6c2ba4c79b0b9c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:36 +0100 Subject: [PATCH 073/168] arm64: stacktrace: report recovered PCs When analysing a stacktrace it can be useful to know whether an unwound PC has been rewritten by fgraph or kretprobes, as in some situations these may be suspect or be known to be unreliable. 
This patch adds flags to track when an unwind entry has recovered the PC from fgraph and/or kretprobes, and updates dump_backtrace() to log when this is the case. The flags recorded are: "F" - the PC was recovered from fgraph "K" - the PC was recovered from kretprobes These flags are recorded and logged in addition to the original source of the unwound PC. For example, with the ftrace_graph profiler enabled globally, and kretprobes installed on generic_handle_domain_irq() and do_interrupt_handler(), a backtrace triggered by magic-sysrq + L reports: | Call trace: | show_stack+0x20/0x40 (CF) | dump_stack_lvl+0x60/0x80 (F) | dump_stack+0x18/0x28 | nmi_cpu_backtrace+0xfc/0x140 | nmi_trigger_cpumask_backtrace+0x1c8/0x200 | arch_trigger_cpumask_backtrace+0x20/0x40 | sysrq_handle_showallcpus+0x24/0x38 (F) | __handle_sysrq+0xa8/0x1b0 (F) | handle_sysrq+0x38/0x50 (F) | pl011_int+0x460/0x5a8 (F) | __handle_irq_event_percpu+0x60/0x220 (F) | handle_irq_event+0x54/0xc0 (F) | handle_fasteoi_irq+0xa8/0x1d0 (F) | generic_handle_domain_irq+0x34/0x58 (F) | gic_handle_irq+0x54/0x140 (FK) | call_on_irq_stack+0x24/0x58 (F) | do_interrupt_handler+0x88/0xa0 | el1_interrupt+0x34/0x68 (FK) | el1h_64_irq_handler+0x18/0x28 | el1h_64_irq+0x64/0x68 | default_idle_call+0x34/0x180 | do_idle+0x204/0x268 | cpu_startup_entry+0x40/0x50 (F) | rest_init+0xe4/0xf0 | start_kernel+0x744/0x750 | __primary_switched+0x80/0x90 Note that as these flags are reported next to the recovered PC value, they appear on the callers of instrumented functions. For example gic_handle_irq() has a "K" marker because generic_handle_domain_irq() was instrumented with kretprobes and had its return address rewritten. Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. 
Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-9-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/stacktrace.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 83ab37e13159..f8e231683dad 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -28,6 +28,14 @@ enum kunwind_source { KUNWIND_SOURCE_REGS_PC, }; +union unwind_flags { + unsigned long all; + struct { + unsigned long fgraph : 1, + kretprobe : 1; + }; +}; + /* * Kernel unwind state * @@ -46,6 +54,7 @@ struct kunwind_state { struct llist_node *kr_cur; #endif enum kunwind_source source; + union unwind_flags flags; }; static __always_inline void @@ -55,6 +64,7 @@ kunwind_init(struct kunwind_state *state, unwind_init_common(&state->common); state->task = task; state->source = KUNWIND_SOURCE_UNKNOWN; + state->flags.all = 0; } /* @@ -127,6 +137,7 @@ kunwind_recover_return_address(struct kunwind_state *state) if (WARN_ON_ONCE(state->common.pc == orig_pc)) return -EINVAL; state->common.pc = orig_pc; + state->flags.fgraph = 1; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -137,6 +148,7 @@ kunwind_recover_return_address(struct kunwind_state *state) (void *)state->common.fp, &state->kr_cur); state->common.pc = orig_pc; + state->flags.kretprobe = 1; } #endif /* CONFIG_KRETPROBES */ @@ -157,6 +169,8 @@ kunwind_next(struct kunwind_state *state) unsigned long fp = state->common.fp; int err; + state->flags.all = 0; + /* Final frame; nothing to unwind */ if (fp == (unsigned long)&task_pt_regs(tsk)->stackframe) return -ENOENT; @@ -331,12 +345,18 @@ static const char *state_source_string(const struct kunwind_state *state) static bool dump_backtrace_entry(const struct kunwind_state *state, void *arg) { const char *source = state_source_string(state); + union unwind_flags flags = state->flags; + bool has_info = source || flags.all; char *loglvl = arg; - printk("%s %pSb%s%s%s\n", loglvl, + + printk("%s %pSb%s%s%s%s%s\n", loglvl, (void *)state->common.pc, - source ? " (" : "", + has_info ? " (" : "", source ? source : "", - source ? ")" : ""); + flags.fgraph ? "F" : "", + flags.kretprobe ? "K" : "", + has_info ? ")" : ""); + return true; } From f05a4a42de9031a819334a90b353ac48fd94f3a4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:37 +0100 Subject: [PATCH 074/168] arm64: stacktrace: split unwind_consume_stack() When unwinding stacks, we use unwind_consume_stack() to both find whether an object (e.g. a frame record) is on an accessible stack *and* to update the stack boundaries. This works fine today since we only care about one type of object which does not overlap other objects. In subsequent patches we'll want to check whether an object (e.g a frame record) is on the stack and follow this up by accessing a larger object containing the first (e.g. a pt_regs with an embedded frame record). To make that pattern easier to implement, this patch reworks unwind_find_next_stack() and unwind_consume_stack() so that the former can be used to check if an object is on any accessible stack, and the latter is purely used to update the stack boundaries. As unwind_find_next_stack() is modified to also check the stack currently being unwound, it is renamed to unwind_find_stack(). There should be no functional change as a result of this patch. 
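As an illustration of the pattern this split is intended to enable (a rough sketch only, not code added by this patch; 'obj_base' and 'obj_size' are placeholders for whichever containing object a later patch cares about):

| /* Check that the frame record is on an accessible stack at all */
| info = unwind_find_stack(state, fp, sizeof(struct frame_record));
| if (!info)
|	return -EINVAL;
|
| /*
|  * ... then consume the larger object which embeds it (e.g. a pt_regs),
|  * so that it cannot be unwound through again.
|  */
| unwind_consume_stack(state, info, obj_base, obj_size);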
Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-10-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/stacktrace/common.h | 68 +++++++++++++--------- 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/arch/arm64/include/asm/stacktrace/common.h b/arch/arm64/include/asm/stacktrace/common.h index 7fab6876e497..821a8fdd31af 100644 --- a/arch/arm64/include/asm/stacktrace/common.h +++ b/arch/arm64/include/asm/stacktrace/common.h @@ -60,13 +60,27 @@ static inline void unwind_init_common(struct unwind_state *state) state->stack = stackinfo_get_unknown(); } -static struct stack_info *unwind_find_next_stack(const struct unwind_state *state, - unsigned long sp, - unsigned long size) +/** + * unwind_find_stack() - Find the accessible stack which entirely contains an + * object. + * + * @state: the current unwind state. + * @sp: the base address of the object. + * @size: the size of the object. + * + * Return: a pointer to the relevant stack_info if found; NULL otherwise. + */ +static struct stack_info *unwind_find_stack(struct unwind_state *state, + unsigned long sp, + unsigned long size) { - for (int i = 0; i < state->nr_stacks; i++) { - struct stack_info *info = &state->stacks[i]; + struct stack_info *info = &state->stack; + if (stackinfo_on_stack(info, sp, size)) + return info; + + for (int i = 0; i < state->nr_stacks; i++) { + info = &state->stacks[i]; if (stackinfo_on_stack(info, sp, size)) return info; } @@ -75,36 +89,31 @@ static struct stack_info *unwind_find_next_stack(const struct unwind_state *stat } /** - * unwind_consume_stack() - Check if an object is on an accessible stack, - * updating stack boundaries so that future unwind steps cannot consume this - * object again. + * unwind_consume_stack() - Update stack boundaries so that future unwind steps + * cannot consume this object again. * * @state: the current unwind state. + * @info: the stack_info of the stack containing the object. * @sp: the base address of the object. * @size: the size of the object. * * Return: 0 upon success, an error code otherwise. */ -static inline int unwind_consume_stack(struct unwind_state *state, - unsigned long sp, - unsigned long size) +static inline void unwind_consume_stack(struct unwind_state *state, + struct stack_info *info, + unsigned long sp, + unsigned long size) { - struct stack_info *next; - - if (stackinfo_on_stack(&state->stack, sp, size)) - goto found; - - next = unwind_find_next_stack(state, sp, size); - if (!next) - return -EINVAL; + struct stack_info tmp; /* * Stack transitions are strictly one-way, and once we've * transitioned from one stack to another, it's never valid to * unwind back to the old stack. * - * Remove the current stack from the list of stacks so that it cannot - * be found on a subsequent transition. + * Destroy the old stack info so that it cannot be found upon a + * subsequent transition. If the stack has not changed, we'll + * immediately restore the current stack info. * * Note that stacks can nest in several valid orders, e.g. * @@ -115,16 +124,15 @@ static inline int unwind_consume_stack(struct unwind_state *state, * ... so we do not check the specific order of stack * transitions. 
*/ - state->stack = *next; - *next = stackinfo_get_unknown(); + tmp = *info; + *info = stackinfo_get_unknown(); + state->stack = tmp; -found: /* * Future unwind steps can only consume stack above this frame record. * Update the current stack to start immediately above it. */ state->stack.low = sp + size; - return 0; } /** @@ -137,16 +145,18 @@ found: static inline int unwind_next_frame_record(struct unwind_state *state) { + struct stack_info *info; struct frame_record *record; unsigned long fp = state->fp; - int err; if (fp & 0x7) return -EINVAL; - err = unwind_consume_stack(state, fp, sizeof(*record)); - if (err) - return err; + info = unwind_find_stack(state, fp, sizeof(*record)); + if (!info) + return -EINVAL; + + unwind_consume_stack(state, info, fp, sizeof(*record)); /* * Record this frame record's values. From c2c6b27b5aa14fa28e3f455f697ccd2e0e75d773 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 17 Oct 2024 10:25:38 +0100 Subject: [PATCH 075/168] arm64: stacktrace: unwind exception boundaries When arm64's stack unwinder encounters an exception boundary, it uses the pt_regs::stackframe created by the entry code, which has a copy of the PC and FP at the time the exception was taken. The unwinder doesn't know anything about pt_regs, and reports the PC from the stackframe, but does not report the LR. The LR is only guaranteed to contain the return address at function call boundaries, and can be used as a scratch register at other times, so the LR at an exception boundary may or may not be a legitimate return address. It would be useful to report the LR value regardless, as it can be helpful when debugging, and in future it will be helpful for reliable stacktrace support. This patch changes the way we unwind across exception boundaries, allowing both the PC and LR to be reported. The entry code creates a frame_record_meta structure embedded within pt_regs, which the unwinder uses to find the pt_regs. The unwinder can then extract pt_regs::pc and pt_regs::lr as two separate unwind steps before continuing with a regular walk of frame records. When a PC is unwound from pt_regs::lr, dump_backtrace() will log this with an "L" marker so that it can be identified easily. For example, an unwind across an exception boundary will appear as follows: | el1h_64_irq+0x6c/0x70 | _raw_spin_unlock_irqrestore+0x10/0x60 (P) | __aarch64_insn_write+0x6c/0x90 (L) | aarch64_insn_patch_text_nosync+0x28/0x80 ... with a (P) entry for pt_regs::pc, and an (L) entry for pt_regs:lr. Note that the LR may be stale at the point of the exception, for example, shortly after a return: | el1h_64_irq+0x6c/0x70 | default_idle_call+0x34/0x180 (P) | default_idle_call+0x28/0x180 (L) | do_idle+0x204/0x268 ... where the LR points a few instructions before the current PC. This plays nicely with all the other unwind metadata tracking. 
With the ftrace_graph profiler enabled globally, and kretprobes installed on generic_handle_domain_irq() and do_interrupt_handler(), a backtrace triggered by magic-sysrq + L reports: | Call trace: | show_stack+0x20/0x40 (CF) | dump_stack_lvl+0x60/0x80 (F) | dump_stack+0x18/0x28 | nmi_cpu_backtrace+0xfc/0x140 | nmi_trigger_cpumask_backtrace+0x1c8/0x200 | arch_trigger_cpumask_backtrace+0x20/0x40 | sysrq_handle_showallcpus+0x24/0x38 (F) | __handle_sysrq+0xa8/0x1b0 (F) | handle_sysrq+0x38/0x50 (F) | pl011_int+0x460/0x5a8 (F) | __handle_irq_event_percpu+0x60/0x220 (F) | handle_irq_event+0x54/0xc0 (F) | handle_fasteoi_irq+0xa8/0x1d0 (F) | generic_handle_domain_irq+0x34/0x58 (F) | gic_handle_irq+0x54/0x140 (FK) | call_on_irq_stack+0x24/0x58 (F) | do_interrupt_handler+0x88/0xa0 | el1_interrupt+0x34/0x68 (FK) | el1h_64_irq_handler+0x18/0x28 | el1h_64_irq+0x6c/0x70 | default_idle_call+0x34/0x180 (P) | default_idle_call+0x28/0x180 (L) | do_idle+0x204/0x268 | cpu_startup_entry+0x3c/0x50 (F) | rest_init+0xe4/0xf0 | start_kernel+0x744/0x750 | __primary_switched+0x88/0x98 Signed-off-by: Mark Rutland Reviewed-by: Mark Brown Reviewed-by: Miroslav Benes Reviewed-by: Puranjay Mohan Cc: Ard Biesheuvel Cc: Josh Poimboeuf Cc: Kalesh Singh Cc: Madhavan T. Venkataraman Cc: Marc Zyngier Cc: Will Deacon Link: https://lore.kernel.org/r/20241017092538.1859841-11-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/ptrace.h | 4 +- arch/arm64/include/asm/stacktrace/frame.h | 35 +++++++ arch/arm64/kernel/asm-offsets.c | 1 + arch/arm64/kernel/entry.S | 12 ++- arch/arm64/kernel/head.S | 3 + arch/arm64/kernel/process.c | 1 + arch/arm64/kernel/stacktrace.c | 121 ++++++++++++++++++++-- 7 files changed, 158 insertions(+), 19 deletions(-) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 89c02f85f4b1..47ff8654c5ec 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -168,9 +168,7 @@ struct pt_regs { u32 pmr; u64 sdei_ttbr1; - u64 unused; - - struct frame_record stackframe; + struct frame_record_meta stackframe; /* Only valid for some EL1 exceptions. */ u64 lockdep_hardirqs; diff --git a/arch/arm64/include/asm/stacktrace/frame.h b/arch/arm64/include/asm/stacktrace/frame.h index 6397bc847f14..0ee0f6ba0fd8 100644 --- a/arch/arm64/include/asm/stacktrace/frame.h +++ b/arch/arm64/include/asm/stacktrace/frame.h @@ -3,6 +3,30 @@ #define __ASM_STACKTRACE_FRAME_H /* + * - FRAME_META_TYPE_NONE + * + * This value is reserved. + * + * - FRAME_META_TYPE_FINAL + * + * The record is the last entry on the stack. + * Unwinding should terminate successfully. + * + * - FRAME_META_TYPE_PT_REGS + * + * The record is embedded within a struct pt_regs, recording the registers at + * an arbitrary point in time. + * Unwinding should consume pt_regs::pc, followed by pt_regs::lr. + * + * Note: all other values are reserved and should result in unwinding + * terminating with an error. + */ +#define FRAME_META_TYPE_NONE 0 +#define FRAME_META_TYPE_FINAL 1 +#define FRAME_META_TYPE_PT_REGS 2 + +#ifndef __ASSEMBLY__ +/* * A standard AAPCS64 frame record. */ struct frame_record { @@ -10,4 +34,15 @@ struct frame_record { u64 lr; }; +/* + * A metadata frame record indicating a special unwind. + * The record::{fp,lr} fields must be zero to indicate the presence of + * metadata. 
+ */ +struct frame_record_meta { + struct frame_record record; + u64 type; +}; +#endif /* __ASSEMBLY */ + #endif /* __ASM_STACKTRACE_FRAME_H */ diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index eb7fb2f9b927..021f04f97fde 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -81,6 +81,7 @@ int main(void) DEFINE(S_SDEI_TTBR1, offsetof(struct pt_regs, sdei_ttbr1)); DEFINE(S_PMR, offsetof(struct pt_regs, pmr)); DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); + DEFINE(S_STACKFRAME_TYPE, offsetof(struct pt_regs, stackframe.type)); DEFINE(PT_REGS_SIZE, sizeof(struct pt_regs)); BLANK(); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index fa6d6d5ca5e0..5ae2a34b50bd 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -284,15 +285,16 @@ alternative_else_nop_endif stp lr, x21, [sp, #S_LR] /* - * For exceptions from EL0, create a final frame record. - * For exceptions from EL1, create a synthetic frame record so the - * interrupted code shows up in the backtrace. + * Create a metadata frame record. The unwinder will use this to + * identify and unwind exception boundaries. */ - .if \el == 0 stp xzr, xzr, [sp, #S_STACKFRAME] + .if \el == 0 + mov x0, #FRAME_META_TYPE_FINAL .else - stp x29, x22, [sp, #S_STACKFRAME] + mov x0, #FRAME_META_TYPE_PT_REGS .endif + str x0, [sp, #S_STACKFRAME_TYPE] add x29, sp, #S_STACKFRAME #ifdef CONFIG_ARM64_SW_TTBR0_PAN diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index cb68adcabe07..5ab1970ee543 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -199,6 +200,8 @@ SYM_CODE_END(preserve_boot_args) sub sp, sp, #PT_REGS_SIZE stp xzr, xzr, [sp, #S_STACKFRAME] + mov \tmp1, #FRAME_META_TYPE_FINAL + str \tmp1, [sp, #S_STACKFRAME_TYPE] add x29, sp, #S_STACKFRAME scs_load_current diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index d45fd114eac3..29904c829de2 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -409,6 +409,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) */ memset(childregs, 0, sizeof(struct pt_regs)); childregs->pstate = PSR_MODE_EL1h | PSR_IL_BIT; + childregs->stackframe.type = FRAME_META_TYPE_FINAL; p->thread.cpu_context.x19 = (unsigned long)args->fn; p->thread.cpu_context.x20 = (unsigned long)args->fn_arg; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index f8e231683dad..caef85462acb 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -26,6 +26,7 @@ enum kunwind_source { KUNWIND_SOURCE_CALLER, KUNWIND_SOURCE_TASK, KUNWIND_SOURCE_REGS_PC, + KUNWIND_SOURCE_REGS_LR, }; union unwind_flags { @@ -55,6 +56,7 @@ struct kunwind_state { #endif enum kunwind_source source; union unwind_flags flags; + struct pt_regs *regs; }; static __always_inline void @@ -65,6 +67,7 @@ kunwind_init(struct kunwind_state *state, state->task = task; state->source = KUNWIND_SOURCE_UNKNOWN; state->flags.all = 0; + state->regs = NULL; } /* @@ -80,6 +83,7 @@ kunwind_init_from_regs(struct kunwind_state *state, { kunwind_init(state, current); + state->regs = regs; state->common.fp = regs->regs[29]; state->common.pc = regs->pc; state->source = KUNWIND_SOURCE_REGS_PC; @@ -155,6 +159,103 @@ 
kunwind_recover_return_address(struct kunwind_state *state) return 0; } +static __always_inline +int kunwind_next_regs_pc(struct kunwind_state *state) +{ + struct stack_info *info; + unsigned long fp = state->common.fp; + struct pt_regs *regs; + + regs = container_of((u64 *)fp, struct pt_regs, stackframe.record.fp); + + info = unwind_find_stack(&state->common, (unsigned long)regs, sizeof(*regs)); + if (!info) + return -EINVAL; + + unwind_consume_stack(&state->common, info, (unsigned long)regs, + sizeof(*regs)); + + state->regs = regs; + state->common.pc = regs->pc; + state->common.fp = regs->regs[29]; + state->source = KUNWIND_SOURCE_REGS_PC; + return 0; +} + +static __always_inline int +kunwind_next_regs_lr(struct kunwind_state *state) +{ + /* + * The stack for the regs was consumed by kunwind_next_regs_pc(), so we + * cannot consume that again here, but we know the regs are safe to + * access. + */ + state->common.pc = state->regs->regs[30]; + state->common.fp = state->regs->regs[29]; + state->regs = NULL; + state->source = KUNWIND_SOURCE_REGS_LR; + + return 0; +} + +static __always_inline int +kunwind_next_frame_record_meta(struct kunwind_state *state) +{ + struct task_struct *tsk = state->task; + unsigned long fp = state->common.fp; + struct frame_record_meta *meta; + struct stack_info *info; + + info = unwind_find_stack(&state->common, fp, sizeof(*meta)); + if (!info) + return -EINVAL; + + meta = (struct frame_record_meta *)fp; + switch (READ_ONCE(meta->type)) { + case FRAME_META_TYPE_FINAL: + if (meta == &task_pt_regs(tsk)->stackframe) + return -ENOENT; + WARN_ON_ONCE(1); + return -EINVAL; + case FRAME_META_TYPE_PT_REGS: + return kunwind_next_regs_pc(state); + default: + WARN_ON_ONCE(1); + return -EINVAL; + } +} + +static __always_inline int +kunwind_next_frame_record(struct kunwind_state *state) +{ + unsigned long fp = state->common.fp; + struct frame_record *record; + struct stack_info *info; + unsigned long new_fp, new_pc; + + if (fp & 0x7) + return -EINVAL; + + info = unwind_find_stack(&state->common, fp, sizeof(*record)); + if (!info) + return -EINVAL; + + record = (struct frame_record *)fp; + new_fp = READ_ONCE(record->fp); + new_pc = READ_ONCE(record->lr); + + if (!new_fp && !new_pc) + return kunwind_next_frame_record_meta(state); + + unwind_consume_stack(&state->common, info, fp, sizeof(*record)); + + state->common.fp = new_fp; + state->common.pc = new_pc; + state->source = KUNWIND_SOURCE_FRAME; + + return 0; +} + /* * Unwind from one frame record (A) to the next frame record (B). 
* @@ -165,30 +266,27 @@ kunwind_recover_return_address(struct kunwind_state *state) static __always_inline int kunwind_next(struct kunwind_state *state) { - struct task_struct *tsk = state->task; - unsigned long fp = state->common.fp; int err; state->flags.all = 0; - /* Final frame; nothing to unwind */ - if (fp == (unsigned long)&task_pt_regs(tsk)->stackframe) - return -ENOENT; - switch (state->source) { case KUNWIND_SOURCE_FRAME: case KUNWIND_SOURCE_CALLER: case KUNWIND_SOURCE_TASK: + case KUNWIND_SOURCE_REGS_LR: + err = kunwind_next_frame_record(state); + break; case KUNWIND_SOURCE_REGS_PC: - err = unwind_next_frame_record(&state->common); - if (err) - return err; - state->source = KUNWIND_SOURCE_FRAME; + err = kunwind_next_regs_lr(state); break; default: - return -EINVAL; + err = -EINVAL; } + if (err) + return err; + state->common.pc = ptrauth_strip_kernel_insn_pac(state->common.pc); return kunwind_recover_return_address(state); @@ -338,6 +436,7 @@ static const char *state_source_string(const struct kunwind_state *state) case KUNWIND_SOURCE_CALLER: return "C"; case KUNWIND_SOURCE_TASK: return "T"; case KUNWIND_SOURCE_REGS_PC: return "P"; + case KUNWIND_SOURCE_REGS_LR: return "L"; default: return "U"; } } From 4e6e8c2b757f382684abc4765202cd25c221dea1 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 4 Oct 2024 21:26:29 +0100 Subject: [PATCH 076/168] binfmt_elf: Wire up AT_HWCAP3 at AT_HWCAP4 AT_HWCAP3 and AT_HWCAP4 were recently defined for use on PowerPC in commit 3281366a8e79 ("uapi/auxvec: Define AT_HWCAP3 and AT_HWCAP4 aux vector, entries"). Since we want to start using AT_HWCAP3 on arm64 add support for exposing both these new hwcaps via binfmt_elf. Signed-off-by: Mark Brown Acked-by: Kees Cook Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20241004-arm64-elf-hwcap3-v2-1-799d1daad8b0@kernel.org Signed-off-by: Catalin Marinas --- fs/binfmt_elf.c | 6 ++++++ fs/binfmt_elf_fdpic.c | 6 ++++++ fs/compat_binfmt_elf.c | 10 ++++++++++ 3 files changed, 22 insertions(+) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 06dc4a57ba78..3039a6b7aba4 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -257,6 +257,12 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes); #ifdef ELF_HWCAP2 NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); +#endif +#ifdef ELF_HWCAP3 + NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3); +#endif +#ifdef ELF_HWCAP4 + NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4); #endif NEW_AUX_ENT(AT_EXECFN, bprm->exec); if (k_platform) { diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 4fe5bb9f1b1f..31d253bd3961 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -623,6 +623,12 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP); #ifdef ELF_HWCAP2 NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); +#endif +#ifdef ELF_HWCAP3 + NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3); +#endif +#ifdef ELF_HWCAP4 + NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4); #endif NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE); NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 8f0af4f62631..d5ef5469e4e6 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -80,6 +80,16 @@ #define ELF_HWCAP2 COMPAT_ELF_HWCAP2 #endif +#ifdef COMPAT_ELF_HWCAP3 +#undef ELF_HWCAP3 +#define ELF_HWCAP3 COMPAT_ELF_HWCAP3 +#endif + +#ifdef COMPAT_ELF_HWCAP4 +#undef ELF_HWCAP4 +#define ELF_HWCAP4 COMPAT_ELF_HWCAP4 +#endif + #ifdef COMPAT_ARCH_DLINFO #undef ARCH_DLINFO #define 
ARCH_DLINFO COMPAT_ARCH_DLINFO From ddadbcdaaed5c3c44cc6c36093f6bf02d942d71d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 4 Oct 2024 21:26:30 +0100 Subject: [PATCH 077/168] arm64: Support AT_HWCAP3 We have filled all 64 bits of AT_HWCAP2 so in order to support discovery of further features provide the framework to use the already defined AT_HWCAP3 for further CPU features. Signed-off-by: Mark Brown Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20241004-arm64-elf-hwcap3-v2-2-799d1daad8b0@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/elf_hwcaps.rst | 6 +++--- arch/arm64/include/asm/cpufeature.h | 3 ++- arch/arm64/include/asm/hwcap.h | 6 +++++- arch/arm64/include/uapi/asm/hwcap.h | 4 ++++ arch/arm64/kernel/cpufeature.c | 6 ++++++ 5 files changed, 20 insertions(+), 5 deletions(-) diff --git a/Documentation/arch/arm64/elf_hwcaps.rst b/Documentation/arch/arm64/elf_hwcaps.rst index 694f67fa07d1..dc1b11d6d112 100644 --- a/Documentation/arch/arm64/elf_hwcaps.rst +++ b/Documentation/arch/arm64/elf_hwcaps.rst @@ -16,9 +16,9 @@ architected discovery mechanism available to userspace code at EL0. The kernel exposes the presence of these features to userspace through a set of flags called hwcaps, exposed in the auxiliary vector. -Userspace software can test for features by acquiring the AT_HWCAP or -AT_HWCAP2 entry of the auxiliary vector, and testing whether the relevant -flags are set, e.g.:: +Userspace software can test for features by acquiring the AT_HWCAP, +AT_HWCAP2 or AT_HWCAP3 entry of the auxiliary vector, and testing +whether the relevant flags are set, e.g.:: bool floating_point_is_present(void) { diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 3d261cc123c1..38e7d1a44ea3 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -12,7 +12,7 @@ #include #include -#define MAX_CPU_FEATURES 128 +#define MAX_CPU_FEATURES 192 #define cpu_feature(x) KERNEL_HWCAP_ ## x #define ARM64_SW_FEATURE_OVERRIDE_NOKASLR 0 @@ -438,6 +438,7 @@ void cpu_set_feature(unsigned int num); bool cpu_have_feature(unsigned int num); unsigned long cpu_get_elf_hwcap(void); unsigned long cpu_get_elf_hwcap2(void); +unsigned long cpu_get_elf_hwcap3(void); #define cpu_set_named_feature(name) cpu_set_feature(cpu_feature(name)) #define cpu_have_named_feature(name) cpu_have_feature(cpu_feature(name)) diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index a775adddecf2..3b5c50df419e 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -159,17 +159,21 @@ #define KERNEL_HWCAP_SME_SF8DP2 __khwcap2_feature(SME_SF8DP2) #define KERNEL_HWCAP_POE __khwcap2_feature(POE) +#define __khwcap3_feature(x) (const_ilog2(HWCAP3_ ## x) + 128) + /* * This yields a mask that user programs can use to figure out what * instruction set this cpu supports. 
*/ #define ELF_HWCAP cpu_get_elf_hwcap() #define ELF_HWCAP2 cpu_get_elf_hwcap2() +#define ELF_HWCAP3 cpu_get_elf_hwcap3() #ifdef CONFIG_COMPAT #define COMPAT_ELF_HWCAP (compat_elf_hwcap) #define COMPAT_ELF_HWCAP2 (compat_elf_hwcap2) -extern unsigned int compat_elf_hwcap, compat_elf_hwcap2; +#define COMPAT_ELF_HWCAP3 (compat_elf_hwcap3) +extern unsigned int compat_elf_hwcap, compat_elf_hwcap2, compat_elf_hwcap3; #endif enum { diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 055381b2c615..3dd4a53a438a 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -124,4 +124,8 @@ #define HWCAP2_SME_SF8DP2 (1UL << 62) #define HWCAP2_POE (1UL << 63) +/* + * HWCAP3 flags - for AT_HWCAP3 + */ + #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 718728a85430..7221636b7907 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -103,6 +103,7 @@ static DECLARE_BITMAP(elf_hwcap, MAX_CPU_FEATURES) __read_mostly; COMPAT_HWCAP_LPAE) unsigned int compat_elf_hwcap __read_mostly = COMPAT_ELF_HWCAP_DEFAULT; unsigned int compat_elf_hwcap2 __read_mostly; +unsigned int compat_elf_hwcap3 __read_mostly; #endif DECLARE_BITMAP(system_cpucaps, ARM64_NCAPS); @@ -3499,6 +3500,11 @@ unsigned long cpu_get_elf_hwcap2(void) return elf_hwcap[1]; } +unsigned long cpu_get_elf_hwcap3(void) +{ + return elf_hwcap[2]; +} + static void __init setup_boot_cpu_capabilities(void) { /* From a2aa5dcc6393dc08844a3f76aa5b7694fbbf99c8 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 16 Aug 2024 16:32:44 +0100 Subject: [PATCH 078/168] kselftest/arm64: signal: drop now redundant GNU_SOURCE definition The definition of GNU_SOURCE was recently centralised in an upper layer kselftest Makefile, so the definition in the arm64 signal tests Makefile is no longer needed. To make things worse, since both definitions are not strictly identical, the compiler warns about it: : warning: "_GNU_SOURCE" redefined : note: this is the location of the previous definition Drop the definition in the arm64/signal Makefile. Fixes: cc937dad85ae ("selftests: centralize -D_GNU_SOURCE= to CFLAGS in lib.mk") Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240816153251.2833702-2-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/signal/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/signal/Makefile b/tools/testing/selftests/arm64/signal/Makefile index edb3613513b8..1381039fb36f 100644 --- a/tools/testing/selftests/arm64/signal/Makefile +++ b/tools/testing/selftests/arm64/signal/Makefile @@ -2,7 +2,7 @@ # Copyright (C) 2019 ARM Limited # Additional include paths needed by kselftest.h and local headers -CFLAGS += -D_GNU_SOURCE -std=gnu99 -I. +CFLAGS += -std=gnu99 -I. SRCS := $(filter-out testcases/testcases.c,$(wildcard testcases/*.c)) PROGS := $(patsubst %.c,%,$(SRCS)) From b0d80dbc378d52155c9ecf9579986edccceed3aa Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 16 Aug 2024 16:32:45 +0100 Subject: [PATCH 079/168] kselftest/arm64: hwcap: fix f8dp2 cpuinfo name The F8DP2 DPISA extension has a separate cpuinfo field, named accordingly. Change the erroneously placed name of "f8dp4" to "f8dp2". 
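For context, the hwcap side of this test is what userspace normally checks by reading the corresponding AT_HWCAP2 auxiliary vector entry; a minimal sketch (assuming glibc's getauxval() and the uapi <asm/hwcap.h> definitions) looks like:

| 	#include <stdbool.h>
| 	#include <sys/auxv.h>
| 	#include <asm/hwcap.h>
|
| 	static bool have_f8dp2(void)
| 	{
| 		/* HWCAP2_F8DP2 is exposed via AT_HWCAP2 */
| 		return getauxval(AT_HWCAP2) & HWCAP2_F8DP2;
| 	}

The same pattern applies to AT_HWCAP3 once flags are allocated there, per the earlier patches in this series.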
Fixes: 44d10c27bd75 ("kselftest/arm64: Add 2023 DPISA hwcap test coverage") Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240816153251.2833702-3-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/abi/hwcap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index f2d6007a2b98..7e95ba5fd496 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -490,7 +490,7 @@ static const struct hwcap_data { .name = "F8DP2", .at_hwcap = AT_HWCAP2, .hwcap_bit = HWCAP2_F8DP2, - .cpuinfo = "f8dp4", + .cpuinfo = "f8dp2", .sigill_fn = f8dp2_sigill, }, { From bf52ca5912c07664276c7b94db820fa2d638b681 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 16 Aug 2024 16:32:46 +0100 Subject: [PATCH 080/168] kselftest/arm64: mte: use proper SKIP syntax If MTE is not available on a system, we detect this early and skip all the MTE selftests. However this happens before we print the TAP plan, so tools parsing the TAP output get confused and report an error. Use the existing ksft_exit_skip() function to handle this, which uses a dummy plan to work with tools expecting proper TAP syntax, as described in the TAP specification. Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240816153251.2833702-4-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/mte_common_util.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.c b/tools/testing/selftests/arm64/mte/mte_common_util.c index 00ffd34c66d3..69e4a67853c4 100644 --- a/tools/testing/selftests/arm64/mte/mte_common_util.c +++ b/tools/testing/selftests/arm64/mte/mte_common_util.c @@ -319,10 +319,9 @@ int mte_default_setup(void) unsigned long en = 0; int ret; - if (!(hwcaps2 & HWCAP2_MTE)) { - ksft_print_msg("SKIP: MTE features unavailable\n"); - return KSFT_SKIP; - } + if (!(hwcaps2 & HWCAP2_MTE)) + ksft_exit_skip("MTE features unavailable\n"); + /* Get current mte mode */ ret = prctl(PR_GET_TAGGED_ADDR_CTRL, en, 0, 0, 0); if (ret < 0) { From 0f995f22a03fef8f3bff51d22a0a78c768536814 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 16 Aug 2024 16:32:47 +0100 Subject: [PATCH 081/168] kselftest/arm64: mte: use string literal for printf-style functions Using pointers for the format specifier strings in printf-style functions can create potential security problems, as the number of arguments to be parsed could vary from call to call. Most compilers consequently warn about those: "format not a string literal and no format arguments [-Wformat-security]" If we only want to print a constant string, we can just use a fixed "%s" format instead, and pass the string as an argument. 
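Concretely, the change amounts to the following (mirroring the evaluate_test() hunks below):

| 	ksft_test_result_pass(msg);		/* warns: format not a string literal */
| 	ksft_test_result_pass("%s", msg);	/* fixed: literal format, msg passed as argument */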
Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240816153251.2833702-5-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/mte_common_util.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.h b/tools/testing/selftests/arm64/mte/mte_common_util.h index 2d3e71724e55..a0017a303beb 100644 --- a/tools/testing/selftests/arm64/mte/mte_common_util.h +++ b/tools/testing/selftests/arm64/mte/mte_common_util.h @@ -77,13 +77,13 @@ static inline void evaluate_test(int err, const char *msg) { switch (err) { case KSFT_PASS: - ksft_test_result_pass(msg); + ksft_test_result_pass("%s", msg); break; case KSFT_FAIL: - ksft_test_result_fail(msg); + ksft_test_result_fail("%s", msg); break; case KSFT_SKIP: - ksft_test_result_skip(msg); + ksft_test_result_skip("%s", msg); break; default: ksft_test_result_error("Unknown return code %d from %s", From 7e893dc81de3e342156389ea0b83ec7d07f25281 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 16 Aug 2024 16:32:49 +0100 Subject: [PATCH 082/168] kselftest/arm64: mte: fix printf type warnings about __u64 When printing the signal context's PC, we use a "%lx" format specifier, which matches the common userland (glibc's) definition of uint64_t as an "unsigned long". However the structure in question is defined in a kernel uapi header, which uses a self defined __u64 type, and the arm64 kernel headers define this using "int-ll64.h", so it becomes an "unsigned long long". This mismatch leads to the usual compiler warning. The common fix would be to use "PRIx64", but because this is defined by the userland's toolchain libc headers, it wouldn't match as well. Since we know the exact type of __u64, just use "%llx" here instead, to silence this warning. This also fixes a more severe typo: "$lx" is not a valid format specifier. 
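As a short illustration of the mismatch (assuming the arm64 uapi int-ll64.h definition of __u64 described above):

| 	__u64 pc = ((ucontext_t *)uc)->uc_mcontext.pc;
|
| 	printf("pc=%lx\n", pc);		/* warns: __u64 is 'unsigned long long' here */
| 	printf("pc=%llx\n", pc);	/* matches the uapi type directly */
| 	printf("pc=%" PRIx64 "\n", pc);	/* PRIx64 follows the libc's uint64_t and may still mismatch */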
Fixes: 191e678bdc9b ("kselftest/arm64: Log unexpected asynchronous MTE faults") Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240816153251.2833702-7-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/mte_common_util.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.c b/tools/testing/selftests/arm64/mte/mte_common_util.c index 69e4a67853c4..9380edca29c7 100644 --- a/tools/testing/selftests/arm64/mte/mte_common_util.c +++ b/tools/testing/selftests/arm64/mte/mte_common_util.c @@ -38,7 +38,7 @@ void mte_default_handler(int signum, siginfo_t *si, void *uc) if (cur_mte_cxt.trig_si_code == si->si_code) cur_mte_cxt.fault_valid = true; else - ksft_print_msg("Got unexpected SEGV_MTEAERR at pc=$lx, fault addr=%lx\n", + ksft_print_msg("Got unexpected SEGV_MTEAERR at pc=%llx, fault addr=%lx\n", ((ucontext_t *)uc)->uc_mcontext.pc, addr); return; @@ -64,7 +64,7 @@ void mte_default_handler(int signum, siginfo_t *si, void *uc) exit(1); } } else if (signum == SIGBUS) { - ksft_print_msg("INFO: SIGBUS signal at pc=%lx, fault addr=%lx, si_code=%lx\n", + ksft_print_msg("INFO: SIGBUS signal at pc=%llx, fault addr=%lx, si_code=%x\n", ((ucontext_t *)uc)->uc_mcontext.pc, addr, si->si_code); if ((cur_mte_cxt.trig_range >= 0 && addr >= MT_CLEAR_TAG(cur_mte_cxt.trig_addr) && From 4716f719202e900b52f5f2270ac16b6a8ae40a47 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 16 Aug 2024 16:32:50 +0100 Subject: [PATCH 083/168] kselftest/arm64: mte: fix printf type warnings about pointers When printing the value of a pointer, we should not use an integer format specifier, but the dedicated "%p" instead. Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240816153251.2833702-8-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/check_buffer_fill.c | 4 ++-- tools/testing/selftests/arm64/mte/mte_common_util.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/arm64/mte/check_buffer_fill.c b/tools/testing/selftests/arm64/mte/check_buffer_fill.c index 1dbbbd47dd50..2ee7f114d7fa 100644 --- a/tools/testing/selftests/arm64/mte/check_buffer_fill.c +++ b/tools/testing/selftests/arm64/mte/check_buffer_fill.c @@ -91,7 +91,7 @@ static int check_buffer_underflow_by_byte(int mem_type, int mode, for (j = 0; j < sizes[i]; j++) { if (ptr[j] != '1') { err = true; - ksft_print_msg("Buffer is not filled at index:%d of ptr:0x%lx\n", + ksft_print_msg("Buffer is not filled at index:%d of ptr:0x%p\n", j, ptr); break; } @@ -189,7 +189,7 @@ static int check_buffer_overflow_by_byte(int mem_type, int mode, for (j = 0; j < sizes[i]; j++) { if (ptr[j] != '1') { err = true; - ksft_print_msg("Buffer is not filled at index:%d of ptr:0x%lx\n", + ksft_print_msg("Buffer is not filled at index:%d of ptr:0x%p\n", j, ptr); break; } diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.c b/tools/testing/selftests/arm64/mte/mte_common_util.c index 9380edca29c7..17fbe5cfe472 100644 --- a/tools/testing/selftests/arm64/mte/mte_common_util.c +++ b/tools/testing/selftests/arm64/mte/mte_common_util.c @@ -100,7 +100,7 @@ void *mte_insert_tags(void *ptr, size_t size) int align_size; if (!ptr || (unsigned long)(ptr) & MT_ALIGN_GRANULE) { - ksft_print_msg("FAIL: Addr=%lx: invalid\n", ptr); + ksft_print_msg("FAIL: Addr=%p: invalid\n", ptr); return NULL; } align_size = MT_ALIGN_UP(size); @@ 
-112,7 +112,7 @@ void *mte_insert_tags(void *ptr, size_t size) void mte_clear_tags(void *ptr, size_t size) { if (!ptr || (unsigned long)(ptr) & MT_ALIGN_GRANULE) { - ksft_print_msg("FAIL: Addr=%lx: invalid\n", ptr); + ksft_print_msg("FAIL: Addr=%p: invalid\n", ptr); return; } size = MT_ALIGN_UP(size); From 96dddb7b9406259baace9a1831e8da155311be6f Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 16 Aug 2024 16:32:51 +0100 Subject: [PATCH 084/168] kselftest/arm64: mte: fix printf type warnings about longs When checking MTE tags, we print some diagnostic messages when the tests fail. Some variables uses there are "longs", however we only use "%x" for the format specifier. Update the format specifiers to "%lx", to match the variable types they are supposed to print. Fixes: f3b2a26ca78d ("kselftest/arm64: Verify mte tag inclusion via prctl") Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20240816153251.2833702-9-andre.przywara@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/check_tags_inclusion.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/arm64/mte/check_tags_inclusion.c b/tools/testing/selftests/arm64/mte/check_tags_inclusion.c index 2b1425b92b69..a3d1e23fe02a 100644 --- a/tools/testing/selftests/arm64/mte/check_tags_inclusion.c +++ b/tools/testing/selftests/arm64/mte/check_tags_inclusion.c @@ -65,7 +65,7 @@ static int check_single_included_tags(int mem_type, int mode) ptr = mte_insert_tags(ptr, BUFFER_SIZE); /* Check tag value */ if (MT_FETCH_TAG((uintptr_t)ptr) == tag) { - ksft_print_msg("FAIL: wrong tag = 0x%x with include mask=0x%x\n", + ksft_print_msg("FAIL: wrong tag = 0x%lx with include mask=0x%x\n", MT_FETCH_TAG((uintptr_t)ptr), MT_INCLUDE_VALID_TAG(tag)); result = KSFT_FAIL; @@ -97,7 +97,7 @@ static int check_multiple_included_tags(int mem_type, int mode) ptr = mte_insert_tags(ptr, BUFFER_SIZE); /* Check tag value */ if (MT_FETCH_TAG((uintptr_t)ptr) < tag) { - ksft_print_msg("FAIL: wrong tag = 0x%x with include mask=0x%x\n", + ksft_print_msg("FAIL: wrong tag = 0x%lx with include mask=0x%lx\n", MT_FETCH_TAG((uintptr_t)ptr), MT_INCLUDE_VALID_TAGS(excl_mask)); result = KSFT_FAIL; From fc7454107d1b7c27bb98d3b109e5f44a8a46d7f8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 18 Oct 2024 09:53:49 +0200 Subject: [PATCH 085/168] arm64/lib: Handle CRC-32 alternative in C code In preparation for adding another code path for performing CRC-32, move the alternative patching for ARM64_HAS_CRC32 into C code. The logic for deciding whether to use this new code path will be implemented in C too. 
Reviewed-by: Eric Biggers Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20241018075347.2821102-6-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/lib/Makefile | 2 +- arch/arm64/lib/crc32-glue.c | 34 ++++++++++++++++++++++++++++++++++ arch/arm64/lib/crc32.S | 22 ++++++---------------- 3 files changed, 41 insertions(+), 17 deletions(-) create mode 100644 arch/arm64/lib/crc32-glue.c diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 13e6a2829116..8e882f479d98 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -13,7 +13,7 @@ endif lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o -obj-$(CONFIG_CRC32) += crc32.o +obj-$(CONFIG_CRC32) += crc32.o crc32-glue.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o diff --git a/arch/arm64/lib/crc32-glue.c b/arch/arm64/lib/crc32-glue.c new file mode 100644 index 000000000000..0b51761d4b75 --- /dev/null +++ b/arch/arm64/lib/crc32-glue.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +#include + +asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len); + +u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) +{ + if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) + return crc32_le_base(crc, p, len); + + return crc32_le_arm64(crc, p, len); +} + +u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len) +{ + if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) + return __crc32c_le_base(crc, p, len); + + return crc32c_le_arm64(crc, p, len); +} + +u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) +{ + if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) + return crc32_be_base(crc, p, len); + + return crc32_be_arm64(crc, p, len); +} diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S index 8340dccff46f..22139691c7ae 100644 --- a/arch/arm64/lib/crc32.S +++ b/arch/arm64/lib/crc32.S @@ -6,7 +6,6 @@ */ #include -#include #include .arch armv8-a+crc @@ -136,25 +135,16 @@ CPU_BE( rev16 \reg, \reg ) .endm .align 5 -SYM_FUNC_START(crc32_le) -alternative_if_not ARM64_HAS_CRC32 - b crc32_le_base -alternative_else_nop_endif +SYM_FUNC_START(crc32_le_arm64) __crc32 -SYM_FUNC_END(crc32_le) +SYM_FUNC_END(crc32_le_arm64) .align 5 -SYM_FUNC_START(__crc32c_le) -alternative_if_not ARM64_HAS_CRC32 - b __crc32c_le_base -alternative_else_nop_endif +SYM_FUNC_START(crc32c_le_arm64) __crc32 c -SYM_FUNC_END(__crc32c_le) +SYM_FUNC_END(crc32c_le_arm64) .align 5 -SYM_FUNC_START(crc32_be) -alternative_if_not ARM64_HAS_CRC32 - b crc32_be_base -alternative_else_nop_endif +SYM_FUNC_START(crc32_be_arm64) __crc32 be=1 -SYM_FUNC_END(crc32_be) +SYM_FUNC_END(crc32_be_arm64) From b98b23e19492f4009070761c53b755f623f60e49 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 18 Oct 2024 09:53:50 +0200 Subject: [PATCH 086/168] arm64/crc32: Reorganize bit/byte ordering macros In preparation for a new user, reorganize the bit/byte ordering macros that are used to parameterize the crc32 template code and instantiate CRC-32, CRC-32c and 'big endian' CRC-32. 
Signed-off-by: Ard Biesheuvel Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20241018075347.2821102-7-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/lib/crc32.S | 93 ++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 53 deletions(-) diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S index 22139691c7ae..f9920492f135 100644 --- a/arch/arm64/lib/crc32.S +++ b/arch/arm64/lib/crc32.S @@ -10,44 +10,48 @@ .arch armv8-a+crc - .macro byteorder, reg, be - .if \be -CPU_LE( rev \reg, \reg ) - .else -CPU_BE( rev \reg, \reg ) - .endif + .macro bitle, reg .endm - .macro byteorder16, reg, be - .if \be -CPU_LE( rev16 \reg, \reg ) - .else -CPU_BE( rev16 \reg, \reg ) - .endif - .endm - - .macro bitorder, reg, be - .if \be + .macro bitbe, reg rbit \reg, \reg - .endif .endm - .macro bitorder16, reg, be - .if \be - rbit \reg, \reg - lsr \reg, \reg, #16 - .endif + .macro bytele, reg .endm - .macro bitorder8, reg, be - .if \be + .macro bytebe, reg rbit \reg, \reg lsr \reg, \reg, #24 - .endif .endm - .macro __crc32, c, be=0 - bitorder w0, \be + .macro hwordle, reg +CPU_BE( rev16 \reg, \reg ) + .endm + + .macro hwordbe, reg +CPU_LE( rev \reg, \reg ) + rbit \reg, \reg +CPU_BE( lsr \reg, \reg, #16 ) + .endm + + .macro le, regs:vararg + .irp r, \regs +CPU_BE( rev \r, \r ) + .endr + .endm + + .macro be, regs:vararg + .irp r, \regs +CPU_LE( rev \r, \r ) + .endr + .irp r, \regs + rbit \r, \r + .endr + .endm + + .macro __crc32, c, order=le + bit\order w0 cmp x2, #16 b.lt 8f // less than 16 bytes @@ -60,14 +64,7 @@ CPU_BE( rev16 \reg, \reg ) add x8, x8, x1 add x1, x1, x7 ldp x5, x6, [x8] - byteorder x3, \be - byteorder x4, \be - byteorder x5, \be - byteorder x6, \be - bitorder x3, \be - bitorder x4, \be - bitorder x5, \be - bitorder x6, \be + \order x3, x4, x5, x6 tst x7, #8 crc32\c\()x w8, w0, x3 @@ -95,42 +92,32 @@ CPU_BE( rev16 \reg, \reg ) 32: ldp x3, x4, [x1], #32 sub x2, x2, #32 ldp x5, x6, [x1, #-16] - byteorder x3, \be - byteorder x4, \be - byteorder x5, \be - byteorder x6, \be - bitorder x3, \be - bitorder x4, \be - bitorder x5, \be - bitorder x6, \be + \order x3, x4, x5, x6 crc32\c\()x w0, w0, x3 crc32\c\()x w0, w0, x4 crc32\c\()x w0, w0, x5 crc32\c\()x w0, w0, x6 cbnz x2, 32b -0: bitorder w0, \be +0: bit\order w0 ret 8: tbz x2, #3, 4f ldr x3, [x1], #8 - byteorder x3, \be - bitorder x3, \be + \order x3 crc32\c\()x w0, w0, x3 4: tbz x2, #2, 2f ldr w3, [x1], #4 - byteorder w3, \be - bitorder w3, \be + \order w3 crc32\c\()w w0, w0, w3 2: tbz x2, #1, 1f ldrh w3, [x1], #2 - byteorder16 w3, \be - bitorder16 w3, \be + hword\order w3 crc32\c\()h w0, w0, w3 1: tbz x2, #0, 0f ldrb w3, [x1] - bitorder8 w3, \be + byte\order w3 crc32\c\()b w0, w0, w3 -0: bitorder w0, \be +0: bit\order w0 ret .endm @@ -146,5 +133,5 @@ SYM_FUNC_END(crc32c_le_arm64) .align 5 SYM_FUNC_START(crc32_be_arm64) - __crc32 be=1 + __crc32 order=be SYM_FUNC_END(crc32_be_arm64) From a6478d69cf56d5deb4c28a6486376d9c7895abec Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 18 Oct 2024 09:53:51 +0200 Subject: [PATCH 087/168] arm64/crc32: Implement 4-way interleave using PMULL Now that kernel mode NEON no longer disables preemption, using FP/SIMD in library code which is not obviously part of the crypto subsystem is no longer problematic, as it will no longer incur unexpected latencies. So accelerate the CRC-32 library code on arm64 to use a 4-way interleave, using PMULL instructions to implement the folding. 
On Apple M2, this results in a speedup of 2 - 2.8x when using input sizes of 1k - 8k. For smaller sizes, the overhead of preserving and restoring the FP/SIMD register file may not be worth it, so 1k is used as a threshold for choosing this code path. The coefficient tables were generated using code provided by Eric. [0] [0] https://github.com/ebiggers/libdeflate/blob/master/scripts/gen_crc32_multipliers.c Cc: Eric Biggers Signed-off-by: Ard Biesheuvel Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20241018075347.2821102-8-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/lib/crc32-glue.c | 48 ++++++++ arch/arm64/lib/crc32.S | 231 +++++++++++++++++++++++++++++++++++- 2 files changed, 276 insertions(+), 3 deletions(-) diff --git a/arch/arm64/lib/crc32-glue.c b/arch/arm64/lib/crc32-glue.c index 0b51761d4b75..295ae3e6b997 100644 --- a/arch/arm64/lib/crc32-glue.c +++ b/arch/arm64/lib/crc32-glue.c @@ -4,16 +4,40 @@ #include #include +#include +#include +#include + +#include + +// The minimum input length to consider the 4-way interleaved code path +static const size_t min_len = 1024; asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len); asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len); asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len); +asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len); + u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len) { if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) return crc32_le_base(crc, p, len); + if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc32_le_arm64_4way(crc, p, len); + kernel_neon_end(); + + p += round_down(len, 64); + len %= 64; + + if (!len) + return crc; + } + return crc32_le_arm64(crc, p, len); } @@ -22,6 +46,18 @@ u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len) if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) return __crc32c_le_base(crc, p, len); + if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc32c_le_arm64_4way(crc, p, len); + kernel_neon_end(); + + p += round_down(len, 64); + len %= 64; + + if (!len) + return crc; + } + return crc32c_le_arm64(crc, p, len); } @@ -30,5 +66,17 @@ u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) if (!alternative_has_cap_likely(ARM64_HAS_CRC32)) return crc32_be_base(crc, p, len); + if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) { + kernel_neon_begin(); + crc = crc32_be_arm64_4way(crc, p, len); + kernel_neon_end(); + + p += round_down(len, 64); + len %= 64; + + if (!len) + return crc; + } + return crc32_be_arm64(crc, p, len); } diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S index f9920492f135..68825317460f 100644 --- a/arch/arm64/lib/crc32.S +++ b/arch/arm64/lib/crc32.S @@ -1,14 +1,17 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Accelerated CRC32(C) using AArch64 CRC instructions + * Accelerated CRC32(C) using AArch64 CRC and PMULL instructions * - * Copyright (C) 2016 - 2018 Linaro Ltd + * Copyright (C) 2016 - 2018 Linaro Ltd. 
+ * Copyright (C) 2024 Google LLC + * + * Author: Ard Biesheuvel */ #include #include - .arch armv8-a+crc + .cpu generic+crc+crypto .macro bitle, reg .endm @@ -135,3 +138,225 @@ SYM_FUNC_END(crc32c_le_arm64) SYM_FUNC_START(crc32_be_arm64) __crc32 order=be SYM_FUNC_END(crc32_be_arm64) + + in .req x1 + len .req x2 + + /* + * w0: input CRC at entry, output CRC at exit + * x1: pointer to input buffer + * x2: length of input in bytes + */ + .macro crc4way, insn, table, order=le + bit\order w0 + lsr len, len, #6 // len := # of 64-byte blocks + + /* Process up to 64 blocks of 64 bytes at a time */ +.La\@: mov x3, #64 + cmp len, #64 + csel x3, x3, len, hi // x3 := min(len, 64) + sub len, len, x3 + + /* Divide the input into 4 contiguous blocks */ + add x4, x3, x3, lsl #1 // x4 := 3 * x3 + add x7, in, x3, lsl #4 // x7 := in + 16 * x3 + add x8, in, x3, lsl #5 // x8 := in + 32 * x3 + add x9, in, x4, lsl #4 // x9 := in + 16 * x4 + + /* Load the folding coefficients from the lookup table */ + adr_l x5, \table - 12 // entry 0 omitted + add x5, x5, x4, lsl #2 // x5 += 12 * x3 + ldp s0, s1, [x5] + ldr s2, [x5, #8] + + /* Zero init partial CRCs for this iteration */ + mov w4, wzr + mov w5, wzr + mov w6, wzr + mov x17, xzr + +.Lb\@: sub x3, x3, #1 + \insn w6, w6, x17 + ldp x10, x11, [in], #16 + ldp x12, x13, [x7], #16 + ldp x14, x15, [x8], #16 + ldp x16, x17, [x9], #16 + + \order x10, x11, x12, x13, x14, x15, x16, x17 + + /* Apply the CRC transform to 4 16-byte blocks in parallel */ + \insn w0, w0, x10 + \insn w4, w4, x12 + \insn w5, w5, x14 + \insn w6, w6, x16 + \insn w0, w0, x11 + \insn w4, w4, x13 + \insn w5, w5, x15 + cbnz x3, .Lb\@ + + /* Combine the 4 partial results into w0 */ + mov v3.d[0], x0 + mov v4.d[0], x4 + mov v5.d[0], x5 + pmull v0.1q, v0.1d, v3.1d + pmull v1.1q, v1.1d, v4.1d + pmull v2.1q, v2.1d, v5.1d + eor v0.8b, v0.8b, v1.8b + eor v0.8b, v0.8b, v2.8b + mov x5, v0.d[0] + eor x5, x5, x17 + \insn w0, w6, x5 + + mov in, x9 + cbnz len, .La\@ + + bit\order w0 + ret + .endm + + .align 5 +SYM_FUNC_START(crc32c_le_arm64_4way) + crc4way crc32cx, .L0 +SYM_FUNC_END(crc32c_le_arm64_4way) + + .align 5 +SYM_FUNC_START(crc32_le_arm64_4way) + crc4way crc32x, .L1 +SYM_FUNC_END(crc32_le_arm64_4way) + + .align 5 +SYM_FUNC_START(crc32_be_arm64_4way) + crc4way crc32x, .L1, be +SYM_FUNC_END(crc32_be_arm64_4way) + + .section .rodata, "a", %progbits + .align 6 +.L0: .long 0xddc0152b, 0xba4fc28e, 0x493c7d27 + .long 0x0715ce53, 0x9e4addf8, 0xba4fc28e + .long 0xc96cfdc0, 0x0715ce53, 0xddc0152b + .long 0xab7aff2a, 0x0d3b6092, 0x9e4addf8 + .long 0x299847d5, 0x878a92a7, 0x39d3b296 + .long 0xb6dd949b, 0xab7aff2a, 0x0715ce53 + .long 0xa60ce07b, 0x83348832, 0x47db8317 + .long 0xd270f1a2, 0xb9e02b86, 0x0d3b6092 + .long 0x65863b64, 0xb6dd949b, 0xc96cfdc0 + .long 0xb3e32c28, 0xbac2fd7b, 0x878a92a7 + .long 0xf285651c, 0xce7f39f4, 0xdaece73e + .long 0x271d9844, 0xd270f1a2, 0xab7aff2a + .long 0x6cb08e5c, 0x2b3cac5d, 0x2162d385 + .long 0xcec3662e, 0x1b03397f, 0x83348832 + .long 0x8227bb8a, 0xb3e32c28, 0x299847d5 + .long 0xd7a4825c, 0xdd7e3b0c, 0xb9e02b86 + .long 0xf6076544, 0x10746f3c, 0x18b33a4e + .long 0x98d8d9cb, 0x271d9844, 0xb6dd949b + .long 0x57a3d037, 0x93a5f730, 0x78d9ccb7 + .long 0x3771e98f, 0x6b749fb2, 0xbac2fd7b + .long 0xe0ac139e, 0xcec3662e, 0xa60ce07b + .long 0x6f345e45, 0xe6fc4e6a, 0xce7f39f4 + .long 0xa2b73df1, 0xb0cd4768, 0x61d82e56 + .long 0x86d8e4d2, 0xd7a4825c, 0xd270f1a2 + .long 0xa90fd27a, 0x0167d312, 0xc619809d + .long 0xca6ef3ac, 0x26f6a60a, 0x2b3cac5d + .long 0x4597456a, 0x98d8d9cb, 0x65863b64 + .long 
0xc9c8b782, 0x68bce87a, 0x1b03397f + .long 0x62ec6c6d, 0x6956fc3b, 0xebb883bd + .long 0x2342001e, 0x3771e98f, 0xb3e32c28 + .long 0xe8b6368b, 0x2178513a, 0x064f7f26 + .long 0x9ef68d35, 0x170076fa, 0xdd7e3b0c + .long 0x0b0bf8ca, 0x6f345e45, 0xf285651c + .long 0x02ee03b2, 0xff0dba97, 0x10746f3c + .long 0x135c83fd, 0xf872e54c, 0xc7a68855 + .long 0x00bcf5f6, 0x86d8e4d2, 0x271d9844 + .long 0x58ca5f00, 0x5bb8f1bc, 0x8e766a0c + .long 0xded288f8, 0xb3af077a, 0x93a5f730 + .long 0x37170390, 0xca6ef3ac, 0x6cb08e5c + .long 0xf48642e9, 0xdd66cbbb, 0x6b749fb2 + .long 0xb25b29f2, 0xe9e28eb4, 0x1393e203 + .long 0x45cddf4e, 0xc9c8b782, 0xcec3662e + .long 0xdfd94fb2, 0x93e106a4, 0x96c515bb + .long 0x021ac5ef, 0xd813b325, 0xe6fc4e6a + .long 0x8e1450f7, 0x2342001e, 0x8227bb8a + .long 0xe0cdcf86, 0x6d9a4957, 0xb0cd4768 + .long 0x613eee91, 0xd2c3ed1a, 0x39c7ff35 + .long 0xbedc6ba1, 0x9ef68d35, 0xd7a4825c + .long 0x0cd1526a, 0xf2271e60, 0x0ab3844b + .long 0xd6c3a807, 0x2664fd8b, 0x0167d312 + .long 0x1d31175f, 0x02ee03b2, 0xf6076544 + .long 0x4be7fd90, 0x363bd6b3, 0x26f6a60a + .long 0x6eeed1c9, 0x5fabe670, 0xa741c1bf + .long 0xb3a6da94, 0x00bcf5f6, 0x98d8d9cb + .long 0x2e7d11a7, 0x17f27698, 0x49c3cc9c + .long 0x889774e1, 0xaa7c7ad5, 0x68bce87a + .long 0x8a074012, 0xded288f8, 0x57a3d037 + .long 0xbd0bb25f, 0x6d390dec, 0x6956fc3b + .long 0x3be3c09b, 0x6353c1cc, 0x42d98888 + .long 0x465a4eee, 0xf48642e9, 0x3771e98f + .long 0x2e5f3c8c, 0xdd35bc8d, 0xb42ae3d9 + .long 0xa52f58ec, 0x9a5ede41, 0x2178513a + .long 0x47972100, 0x45cddf4e, 0xe0ac139e + .long 0x359674f7, 0xa51b6135, 0x170076fa + +.L1: .long 0xaf449247, 0x81256527, 0xccaa009e + .long 0x57c54819, 0x1d9513d7, 0x81256527 + .long 0x3f41287a, 0x57c54819, 0xaf449247 + .long 0xf5e48c85, 0x910eeec1, 0x1d9513d7 + .long 0x1f0c2cdd, 0x9026d5b1, 0xae0b5394 + .long 0x71d54a59, 0xf5e48c85, 0x57c54819 + .long 0x1c63267b, 0xfe807bbd, 0x0cbec0ed + .long 0xd31343ea, 0xe95c1271, 0x910eeec1 + .long 0xf9d9c7ee, 0x71d54a59, 0x3f41287a + .long 0x9ee62949, 0xcec97417, 0x9026d5b1 + .long 0xa55d1514, 0xf183c71b, 0xd1df2327 + .long 0x21aa2b26, 0xd31343ea, 0xf5e48c85 + .long 0x9d842b80, 0xeea395c4, 0x3c656ced + .long 0xd8110ff1, 0xcd669a40, 0xfe807bbd + .long 0x3f9e9356, 0x9ee62949, 0x1f0c2cdd + .long 0x1d6708a0, 0x0c30f51d, 0xe95c1271 + .long 0xef82aa68, 0xdb3935ea, 0xb918a347 + .long 0xd14bcc9b, 0x21aa2b26, 0x71d54a59 + .long 0x99cce860, 0x356d209f, 0xff6f2fc2 + .long 0xd8af8e46, 0xc352f6de, 0xcec97417 + .long 0xf1996890, 0xd8110ff1, 0x1c63267b + .long 0x631bc508, 0xe95c7216, 0xf183c71b + .long 0x8511c306, 0x8e031a19, 0x9b9bdbd0 + .long 0xdb3839f3, 0x1d6708a0, 0xd31343ea + .long 0x7a92fffb, 0xf7003835, 0x4470ac44 + .long 0x6ce68f2a, 0x00eba0c8, 0xeea395c4 + .long 0x4caaa263, 0xd14bcc9b, 0xf9d9c7ee + .long 0xb46f7cff, 0x9a1b53c8, 0xcd669a40 + .long 0x60290934, 0x81b6f443, 0x6d40f445 + .long 0x8e976a7d, 0xd8af8e46, 0x9ee62949 + .long 0xdcf5088a, 0x9dbdc100, 0x145575d5 + .long 0x1753ab84, 0xbbf2f6d6, 0x0c30f51d + .long 0x255b139e, 0x631bc508, 0xa55d1514 + .long 0xd784eaa8, 0xce26786c, 0xdb3935ea + .long 0x6d2c864a, 0x8068c345, 0x2586d334 + .long 0x02072e24, 0xdb3839f3, 0x21aa2b26 + .long 0x06689b0a, 0x5efd72f5, 0xe0575528 + .long 0x1e52f5ea, 0x4117915b, 0x356d209f + .long 0x1d3d1db6, 0x6ce68f2a, 0x9d842b80 + .long 0x3796455c, 0xb8e0e4a8, 0xc352f6de + .long 0xdf3a4eb3, 0xc55a2330, 0xb84ffa9c + .long 0x28ae0976, 0xb46f7cff, 0xd8110ff1 + .long 0x9764bc8d, 0xd7e7a22c, 0x712510f0 + .long 0x13a13e18, 0x3e9a43cd, 0xe95c7216 + .long 0xb8ee242e, 0x8e976a7d, 0x3f9e9356 + .long 0x0c540e7b, 
0x753c81ff, 0x8e031a19 + .long 0x9924c781, 0xb9220208, 0x3edcde65 + .long 0x3954de39, 0x1753ab84, 0x1d6708a0 + .long 0xf32238b5, 0xbec81497, 0x9e70b943 + .long 0xbbd2cd2c, 0x0925d861, 0xf7003835 + .long 0xcc401304, 0xd784eaa8, 0xef82aa68 + .long 0x4987e684, 0x6044fbb0, 0x00eba0c8 + .long 0x3aa11427, 0x18fe3b4a, 0x87441142 + .long 0x297aad60, 0x02072e24, 0xd14bcc9b + .long 0xf60c5e51, 0x6ef6f487, 0x5b7fdd0a + .long 0x632d78c5, 0x3fc33de4, 0x9a1b53c8 + .long 0x25b8822a, 0x1e52f5ea, 0x99cce860 + .long 0xd4fc84bc, 0x1af62fb8, 0x81b6f443 + .long 0x5690aa32, 0xa91fdefb, 0x688a110e + .long 0x1357a093, 0x3796455c, 0xd8af8e46 + .long 0x798fdd33, 0xaaa18a37, 0x357b9517 + .long 0xc2815395, 0x54d42691, 0x9dbdc100 + .long 0x21cfc0f7, 0x28ae0976, 0xf1996890 + .long 0xa0decef3, 0x7b4aa8b7, 0xbbf2f6d6 From f260c442676310329b8b4482d3be59fe0e742220 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 21 Oct 2024 17:44:56 +0100 Subject: [PATCH 088/168] arm64: preserve pt_regs::stackframe during exec*() When performing an exec*(), there's a transient period before the return to userspace where any stacktrace will result in a warning triggered by kunwind_next_frame_record_meta() encountering a struct frame_record_meta with an unknown type. This can be seen fairly reliably by enabling KASAN or KFENCE, e.g. | WARNING: CPU: 3 PID: 143 at arch/arm64/kernel/stacktrace.c:223 arch_stack_walk+0x264/0x3b0 | Modules linked in: | CPU: 3 UID: 0 PID: 143 Comm: login Not tainted 6.12.0-rc2-00010-g0f0b9a3f6a50 #1 | Hardware name: linux,dummy-virt (DT) | pstate: 814000c5 (Nzcv daIF +PAN -UAO -TCO +DIT -SSBS BTYPE=--) | pc : arch_stack_walk+0x264/0x3b0 | lr : arch_stack_walk+0x1ec/0x3b0 | sp : ffff80008060b970 | x29: ffff80008060ba10 x28: fff00000051133c0 x27: 0000000000000000 | x26: 0000000000000000 x25: 0000000000000000 x24: fff000007fe84000 | x23: ffff9d1b3c940af0 x22: 0000000000000000 x21: fff00000051133c0 | x20: ffff80008060ba50 x19: ffff9d1b3c9408e0 x18: 0000000000000014 | x17: 000000006d50da47 x16: 000000008e3f265e x15: fff0000004e8bf40 | x14: 0000ffffc5e50e48 x13: 000000000000000f x12: 0000ffffc5e50fed | x11: 000000000000001f x10: 000018007f8bffff x9 : 0000000000000000 | x8 : ffff80008060b9c0 x7 : ffff80008060bfd8 x6 : ffff80008060ba80 | x5 : ffff80008060ba00 x4 : ffff80008060c000 x3 : ffff80008060bff0 | x2 : 0000000000000018 x1 : ffff80008060bfd8 x0 : 0000000000000000 | Call trace: | arch_stack_walk+0x264/0x3b0 (P) | arch_stack_walk+0x1ec/0x3b0 (L) | stack_trace_save+0x50/0x80 | metadata_update_state+0x98/0xa0 | kfence_guarded_free+0xec/0x2c4 | __kfence_free+0x50/0x100 | kmem_cache_free+0x1a4/0x37c | putname+0x9c/0xc0 | do_execveat_common.isra.0+0xf0/0x1e4 | __arm64_sys_execve+0x40/0x60 | invoke_syscall+0x48/0x104 | el0_svc_common.constprop.0+0x40/0xe0 | do_el0_svc+0x1c/0x28 | el0_svc+0x34/0xe0 | el0t_64_sync_handler+0x120/0x12c | el0t_64_sync+0x198/0x19c This happens because start_thread_common() zeroes the entirety of current_pt_regs(), including pt_regs::stackframe::type, changing this from FRAME_META_TYPE_FINAL to 0 and making the final record invalid. The stacktrace code will reject this until the next return to userspace, where a subsequent exception entry will reinitialize the type to FRAME_META_TYPE_FINAL. This zeroing wasn't a problem prior to commit: c2c6b27b5aa14fa2 ("arm64: stacktrace: unwind exception boundaries") ... as before that commit the stacktrace code only expected the final pt_regs::stackframe to contain zeroes, which was unchanged by start_thread_common(). 
A stacktrace could occur at any time, either due to instrumentation or an exception, and so start_thread_common() must ensure that pt_regs::stackframe is always valid. Fix this by changing the way start_thread_common() zeroes and reinitializes the pt_regs fields: * The '{regs,pc,pstate}' fields are initialized in one go via a struct assignment to the user_regs, with start_thread() and compat_start_thread() modified to pass 'pstate' into start_thread_common(). * The 'sp' and 'compat_sp' fields are zeroed by the struct assignment in start_thread_common(), and subsequently overwritten in start_thread() and compat_start_thread respectively, matching existing behaviour. * The 'syscallno' field is implicitly preserved while the 'orig_x0' field is explicitly zeroed, maintaining existing ABI. * The 'pmr' field is explicitly initialized, as necessary for an exec*() from a kernel thread, matching existing behaviour. * The 'stackframe' field is implicitly preserved, with a new comment and some assertions to ensure we don't accidentally break this in future. * All other fields are implicitly preserved, and should have no functional impact: - 'sdei_ttbr1' is only used for SDEI exception entry/exit, and we never exec*() inside an SDEI handler. - 'lockdep_hardirqs' and 'exit_rcu' are only used for EL1 exception entry/exit, and we never exec*() inside an EL1 exception handler. While updating compat_start_thread() to pass 'pstate' into start_thread_common(), I've also updated the logic to remove the ifdeffery, replacing: | #ifdef __AARCH64EB__ | regs->pstate |= PSR_AA32_E_BIT; | #endif ... with: | if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) | pstate |= PSR_AA32_E_BIT; ... which should be functionally equivalent, and matches our preferred code style. Fixes: c2c6b27b5aa1 ("arm64: stacktrace: unwind exception boundaries") Signed-off-by: Mark Rutland Cc: Mark Brown Cc: Miroslav Benes Cc: Puranjay Mohan Cc: Will Deacon Fixes: c2c6b27b5aa1 ("arm64: stacktrace: unwind exception boundaries") Tested-by: Puranjay Mohan Reviewed-by: Puranjay Mohan Link: https://lore.kernel.org/r/20241021164456.2275285-1-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/processor.h | 48 +++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 03b99164f7f4..e277105fb87a 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -285,22 +285,44 @@ void tls_preserve_current_state(void); .fpsimd_cpu = NR_CPUS, \ } -static inline void start_thread_common(struct pt_regs *regs, unsigned long pc) +static inline void start_thread_common(struct pt_regs *regs, unsigned long pc, + unsigned long pstate) { - s32 previous_syscall = regs->syscallno; - memset(regs, 0, sizeof(*regs)); - regs->syscallno = previous_syscall; - regs->pc = pc; + /* + * Ensure all GPRs are zeroed, and initialize PC + PSTATE. + * The SP (or compat SP) will be initialized later. + */ + regs->user_regs = (struct user_pt_regs) { + .pc = pc, + .pstate = pstate, + }; + /* + * To allow the syscalls:sys_exit_execve tracepoint we need to preserve + * syscallno, but do not need orig_x0 or the original GPRs. + */ + regs->orig_x0 = 0; + + /* + * An exec from a kernel thread won't have an existing PMR value. + */ if (system_uses_irq_prio_masking()) regs->pmr = GIC_PRIO_IRQON; + + /* + * The pt_regs::stackframe field must remain valid throughout this + * function as a stacktrace can be taken at any time. 
Any user or + * kernel task should have a valid final frame. + */ + WARN_ON_ONCE(regs->stackframe.record.fp != 0); + WARN_ON_ONCE(regs->stackframe.record.lr != 0); + WARN_ON_ONCE(regs->stackframe.type != FRAME_META_TYPE_FINAL); } static inline void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp) { - start_thread_common(regs, pc); - regs->pstate = PSR_MODE_EL0t; + start_thread_common(regs, pc, PSR_MODE_EL0t); spectre_v4_enable_task_mitigation(current); regs->sp = sp; } @@ -309,15 +331,13 @@ static inline void start_thread(struct pt_regs *regs, unsigned long pc, static inline void compat_start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp) { - start_thread_common(regs, pc); - regs->pstate = PSR_AA32_MODE_USR; + unsigned long pstate = PSR_AA32_MODE_USR; if (pc & 1) - regs->pstate |= PSR_AA32_T_BIT; - -#ifdef __AARCH64EB__ - regs->pstate |= PSR_AA32_E_BIT; -#endif + pstate |= PSR_AA32_T_BIT; + if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) + pstate |= PSR_AA32_E_BIT; + start_thread_common(regs, pc, pstate); spectre_v4_enable_task_mitigation(current); regs->compat_sp = sp; } From 7a08cb9b4bb92fb86f5fe8a3aa0ac08a9b3d783b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 17 Oct 2024 18:43:31 +0100 Subject: [PATCH 089/168] kselftest/arm64: Fail the overall fp-stress test if any test fails Currently fp-stress does not report a top level test result if it runs to completion, it always exits with a return code 0. Use the ksft_finished() helper to ensure that the exit code for the top level program reports a failure if any of the individual tests has failed. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241017-arm64-fp-stress-exit-code-v1-1-f528e53a2321@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/fp-stress.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c index faac24bdefeb..e62c9dbad501 100644 --- a/tools/testing/selftests/arm64/fp/fp-stress.c +++ b/tools/testing/selftests/arm64/fp/fp-stress.c @@ -651,7 +651,5 @@ int main(int argc, char **argv) drain_output(true); - ksft_print_cnts(); - - return 0; + ksft_finished(); } From b880a80011f56880f32bde47fc6af313359f926b Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 17 Oct 2024 14:14:24 +0100 Subject: [PATCH 090/168] arm64: rsi: Add RSI definitions The RMM (Realm Management Monitor) provides functionality that can be accessed by a realm guest through SMC (Realm Services Interface) calls. The SMC definitions are based on DEN0137[1] version 1.0-rel0. [1] https://developer.arm.com/documentation/den0137/1-0rel0/ Acked-by: Catalin Marinas Reviewed-by: Gavin Shan Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-2-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/rsi_cmds.h | 139 +++++++++++++++++++++ arch/arm64/include/asm/rsi_smc.h | 193 ++++++++++++++++++++++++++++++ 2 files changed, 332 insertions(+) create mode 100644 arch/arm64/include/asm/rsi_cmds.h create mode 100644 arch/arm64/include/asm/rsi_smc.h diff --git a/arch/arm64/include/asm/rsi_cmds.h b/arch/arm64/include/asm/rsi_cmds.h new file mode 100644 index 000000000000..2fcf351b5634 --- /dev/null +++ b/arch/arm64/include/asm/rsi_cmds.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023 ARM Ltd. 
+ */ + +#ifndef __ASM_RSI_CMDS_H +#define __ASM_RSI_CMDS_H + +#include + +#include + +#define RSI_GRANULE_SHIFT 12 +#define RSI_GRANULE_SIZE (_AC(1, UL) << RSI_GRANULE_SHIFT) + +enum ripas { + RSI_RIPAS_EMPTY = 0, + RSI_RIPAS_RAM = 1, + RSI_RIPAS_DESTROYED = 2, + RSI_RIPAS_DEV = 3, +}; + +static inline unsigned long rsi_request_version(unsigned long req, + unsigned long *out_lower, + unsigned long *out_higher) +{ + struct arm_smccc_res res; + + arm_smccc_smc(SMC_RSI_ABI_VERSION, req, 0, 0, 0, 0, 0, 0, &res); + + if (out_lower) + *out_lower = res.a1; + if (out_higher) + *out_higher = res.a2; + + return res.a0; +} + +static inline unsigned long rsi_get_realm_config(struct realm_config *cfg) +{ + struct arm_smccc_res res; + + arm_smccc_smc(SMC_RSI_REALM_CONFIG, virt_to_phys(cfg), + 0, 0, 0, 0, 0, 0, &res); + return res.a0; +} + +static inline long rsi_set_addr_range_state(phys_addr_t start, + phys_addr_t end, + enum ripas state, + unsigned long flags, + phys_addr_t *top) +{ + struct arm_smccc_res res; + + arm_smccc_smc(SMC_RSI_IPA_STATE_SET, start, end, state, + flags, 0, 0, 0, &res); + + if (top) + *top = res.a1; + + if (res.a2 != RSI_ACCEPT) + return -EPERM; + + return res.a0; +} + +/** + * rsi_attestation_token_init - Initialise the operation to retrieve an + * attestation token. + * + * @challenge: The challenge data to be used in the attestation token + * generation. + * @size: Size of the challenge data in bytes. + * + * Initialises the attestation token generation and returns an upper bound + * on the attestation token size that can be used to allocate an adequate + * buffer. The caller is expected to subsequently call + * rsi_attestation_token_continue() to retrieve the attestation token data on + * the same CPU. + * + * Returns: + * On success, returns the upper limit of the attestation report size. + * Otherwise, -EINVAL + */ +static inline long +rsi_attestation_token_init(const u8 *challenge, unsigned long size) +{ + struct arm_smccc_1_2_regs regs = { 0 }; + + /* The challenge must be at least 32bytes and at most 64bytes */ + if (!challenge || size < 32 || size > 64) + return -EINVAL; + + regs.a0 = SMC_RSI_ATTESTATION_TOKEN_INIT; + memcpy(®s.a1, challenge, size); + arm_smccc_1_2_smc(®s, ®s); + + if (regs.a0 == RSI_SUCCESS) + return regs.a1; + + return -EINVAL; +} + +/** + * rsi_attestation_token_continue - Continue the operation to retrieve an + * attestation token. + * + * @granule: {I}PA of the Granule to which the token will be written. + * @offset: Offset within Granule to start of buffer in bytes. + * @size: The size of the buffer. + * @len: The number of bytes written to the buffer. + * + * Retrieves up to a RSI_GRANULE_SIZE worth of token data per call. The caller + * is expected to call rsi_attestation_token_init() before calling this + * function to retrieve the attestation token. + * + * Return: + * * %RSI_SUCCESS - Attestation token retrieved successfully. + * * %RSI_INCOMPLETE - Token generation is not complete. + * * %RSI_ERROR_INPUT - A parameter was not valid. + * * %RSI_ERROR_STATE - Attestation not in progress. 
+ */ +static inline unsigned long rsi_attestation_token_continue(phys_addr_t granule, + unsigned long offset, + unsigned long size, + unsigned long *len) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(SMC_RSI_ATTESTATION_TOKEN_CONTINUE, + granule, offset, size, 0, &res); + + if (len) + *len = res.a1; + return res.a0; +} + +#endif /* __ASM_RSI_CMDS_H */ diff --git a/arch/arm64/include/asm/rsi_smc.h b/arch/arm64/include/asm/rsi_smc.h new file mode 100644 index 000000000000..6cb070eca9e9 --- /dev/null +++ b/arch/arm64/include/asm/rsi_smc.h @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#ifndef __ASM_RSI_SMC_H_ +#define __ASM_RSI_SMC_H_ + +#include + +/* + * This file describes the Realm Services Interface (RSI) Application Binary + * Interface (ABI) for SMC calls made from within the Realm to the RMM and + * serviced by the RMM. + */ + +/* + * The major version number of the RSI implementation. This is increased when + * the binary format or semantics of the SMC calls change. + */ +#define RSI_ABI_VERSION_MAJOR UL(1) + +/* + * The minor version number of the RSI implementation. This is increased when + * a bug is fixed, or a feature is added without breaking binary compatibility. + */ +#define RSI_ABI_VERSION_MINOR UL(0) + +#define RSI_ABI_VERSION ((RSI_ABI_VERSION_MAJOR << 16) | \ + RSI_ABI_VERSION_MINOR) + +#define RSI_ABI_VERSION_GET_MAJOR(_version) ((_version) >> 16) +#define RSI_ABI_VERSION_GET_MINOR(_version) ((_version) & 0xFFFF) + +#define RSI_SUCCESS UL(0) +#define RSI_ERROR_INPUT UL(1) +#define RSI_ERROR_STATE UL(2) +#define RSI_INCOMPLETE UL(3) +#define RSI_ERROR_UNKNOWN UL(4) + +#define SMC_RSI_FID(n) ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_STANDARD, \ + n) + +/* + * Returns RSI version. + * + * arg1 == Requested interface revision + * ret0 == Status / error + * ret1 == Lower implemented interface revision + * ret2 == Higher implemented interface revision + */ +#define SMC_RSI_ABI_VERSION SMC_RSI_FID(0x190) + +/* + * Read feature register. + * + * arg1 == Feature register index + * ret0 == Status / error + * ret1 == Feature register value + */ +#define SMC_RSI_FEATURES SMC_RSI_FID(0x191) + +/* + * Read measurement for the current Realm. + * + * arg1 == Index, which measurements slot to read + * ret0 == Status / error + * ret1 == Measurement value, bytes: 0 - 7 + * ret2 == Measurement value, bytes: 8 - 15 + * ret3 == Measurement value, bytes: 16 - 23 + * ret4 == Measurement value, bytes: 24 - 31 + * ret5 == Measurement value, bytes: 32 - 39 + * ret6 == Measurement value, bytes: 40 - 47 + * ret7 == Measurement value, bytes: 48 - 55 + * ret8 == Measurement value, bytes: 56 - 63 + */ +#define SMC_RSI_MEASUREMENT_READ SMC_RSI_FID(0x192) + +/* + * Extend Realm Extensible Measurement (REM) value. + * + * arg1 == Index, which measurements slot to extend + * arg2 == Size of realm measurement in bytes, max 64 bytes + * arg3 == Measurement value, bytes: 0 - 7 + * arg4 == Measurement value, bytes: 8 - 15 + * arg5 == Measurement value, bytes: 16 - 23 + * arg6 == Measurement value, bytes: 24 - 31 + * arg7 == Measurement value, bytes: 32 - 39 + * arg8 == Measurement value, bytes: 40 - 47 + * arg9 == Measurement value, bytes: 48 - 55 + * arg10 == Measurement value, bytes: 56 - 63 + * ret0 == Status / error + */ +#define SMC_RSI_MEASUREMENT_EXTEND SMC_RSI_FID(0x193) + +/* + * Initialize the operation to retrieve an attestation token. 
+ * + * arg1 == Challenge value, bytes: 0 - 7 + * arg2 == Challenge value, bytes: 8 - 15 + * arg3 == Challenge value, bytes: 16 - 23 + * arg4 == Challenge value, bytes: 24 - 31 + * arg5 == Challenge value, bytes: 32 - 39 + * arg6 == Challenge value, bytes: 40 - 47 + * arg7 == Challenge value, bytes: 48 - 55 + * arg8 == Challenge value, bytes: 56 - 63 + * ret0 == Status / error + * ret1 == Upper bound of token size in bytes + */ +#define SMC_RSI_ATTESTATION_TOKEN_INIT SMC_RSI_FID(0x194) + +/* + * Continue the operation to retrieve an attestation token. + * + * arg1 == The IPA of token buffer + * arg2 == Offset within the granule of the token buffer + * arg3 == Size of the granule buffer + * ret0 == Status / error + * ret1 == Length of token bytes copied to the granule buffer + */ +#define SMC_RSI_ATTESTATION_TOKEN_CONTINUE SMC_RSI_FID(0x195) + +#ifndef __ASSEMBLY__ + +struct realm_config { + union { + struct { + unsigned long ipa_bits; /* Width of IPA in bits */ + unsigned long hash_algo; /* Hash algorithm */ + }; + u8 pad[0x200]; + }; + union { + u8 rpv[64]; /* Realm Personalization Value */ + u8 pad2[0xe00]; + }; + /* + * The RMM requires the configuration structure to be aligned to a 4k + * boundary, ensure this happens by aligning this structure. + */ +} __aligned(0x1000); + +#endif /* __ASSEMBLY__ */ + +/* + * Read configuration for the current Realm. + * + * arg1 == struct realm_config addr + * ret0 == Status / error + */ +#define SMC_RSI_REALM_CONFIG SMC_RSI_FID(0x196) + +/* + * Request RIPAS of a target IPA range to be changed to a specified value. + * + * arg1 == Base IPA address of target region + * arg2 == Top of the region + * arg3 == RIPAS value + * arg4 == flags + * ret0 == Status / error + * ret1 == Top of modified IPA range + * ret2 == Whether the Host accepted or rejected the request + */ +#define SMC_RSI_IPA_STATE_SET SMC_RSI_FID(0x197) + +#define RSI_NO_CHANGE_DESTROYED UL(0) +#define RSI_CHANGE_DESTROYED UL(1) + +#define RSI_ACCEPT UL(0) +#define RSI_REJECT UL(1) + +/* + * Get RIPAS of a target IPA range. + * + * arg1 == Base IPA of target region + * arg2 == End of target IPA region + * ret0 == Status / error + * ret1 == Top of IPA region which has the reported RIPAS value + * ret2 == RIPAS value + */ +#define SMC_RSI_IPA_STATE_GET SMC_RSI_FID(0x198) + +/* + * Make a Host call. + * + * arg1 == IPA of host call structure + * ret0 == Status / error + */ +#define SMC_RSI_HOST_CALL SMC_RSI_FID(0x199) + +#endif /* __ASM_RSI_SMC_H_ */ From c077711f718be7cebcc8b987eac2ebfd17447e9f Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 17 Oct 2024 14:14:25 +0100 Subject: [PATCH 091/168] arm64: Detect if in a realm and set RIPAS RAM Detect that the VM is a realm guest by the presence of the RSI interface. This is done after PSCI has been initialised so that we can check the SMCCC conduit before making any RSI calls. If in a realm then iterate over all memory ensuring that it is marked as RIPAS RAM. The loader is required to do this for us, however if some memory is missed this will cause the guest to receive a hard to debug external abort at some random point in the future. So for a belt-and-braces approach set all memory to RIPAS RAM. Any failure here implies that the RAM regions passed to Linux are incorrect so panic() promptly to make the situation clear. 
Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Signed-off-by: Suzuki K Poulose Co-developed-by: Steven Price Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-3-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/rsi.h | 66 +++++++++++++++++++++++++++++++ arch/arm64/kernel/Makefile | 3 +- arch/arm64/kernel/rsi.c | 76 ++++++++++++++++++++++++++++++++++++ arch/arm64/kernel/setup.c | 3 ++ 4 files changed, 147 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/include/asm/rsi.h create mode 100644 arch/arm64/kernel/rsi.c diff --git a/arch/arm64/include/asm/rsi.h b/arch/arm64/include/asm/rsi.h new file mode 100644 index 000000000000..acba065eb00e --- /dev/null +++ b/arch/arm64/include/asm/rsi.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2024 ARM Ltd. + */ + +#ifndef __ASM_RSI_H_ +#define __ASM_RSI_H_ + +#include +#include +#include + +DECLARE_STATIC_KEY_FALSE(rsi_present); + +void __init arm64_rsi_init(void); + +static inline bool is_realm_world(void) +{ + return static_branch_unlikely(&rsi_present); +} + +static inline int rsi_set_memory_range(phys_addr_t start, phys_addr_t end, + enum ripas state, unsigned long flags) +{ + unsigned long ret; + phys_addr_t top; + + while (start != end) { + ret = rsi_set_addr_range_state(start, end, state, flags, &top); + if (ret || top < start || top > end) + return -EINVAL; + start = top; + } + + return 0; +} + +/* + * Convert the specified range to RAM. Do not use this if you rely on the + * contents of a page that may already be in RAM state. + */ +static inline int rsi_set_memory_range_protected(phys_addr_t start, + phys_addr_t end) +{ + return rsi_set_memory_range(start, end, RSI_RIPAS_RAM, + RSI_CHANGE_DESTROYED); +} + +/* + * Convert the specified range to RAM. Do not convert any pages that may have + * been DESTROYED, without our permission. + */ +static inline int rsi_set_memory_range_protected_safe(phys_addr_t start, + phys_addr_t end) +{ + return rsi_set_memory_range(start, end, RSI_RIPAS_RAM, + RSI_NO_CHANGE_DESTROYED); +} + +static inline int rsi_set_memory_range_shared(phys_addr_t start, + phys_addr_t end) +{ + return rsi_set_memory_range(start, end, RSI_RIPAS_EMPTY, + RSI_CHANGE_DESTROYED); +} +#endif /* __ASM_RSI_H_ */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 2b112f3b7510..71c29a2a2f19 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -33,7 +33,8 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ - syscall.o proton-pack.o idle.o patching.o pi/ + syscall.o proton-pack.o idle.o patching.o pi/ \ + rsi.o obj-$(CONFIG_COMPAT) += sys32.o signal32.o \ sys_compat.o diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c new file mode 100644 index 000000000000..c5758317dfed --- /dev/null +++ b/arch/arm64/kernel/rsi.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#include +#include +#include +#include + +DEFINE_STATIC_KEY_FALSE_RO(rsi_present); +EXPORT_SYMBOL(rsi_present); + +static bool rsi_version_matches(void) +{ + unsigned long ver_lower, ver_higher; + unsigned long ret = rsi_request_version(RSI_ABI_VERSION, + &ver_lower, + &ver_higher); + + if (ret == SMCCC_RET_NOT_SUPPORTED) + return false; + + if (ret != RSI_SUCCESS) { + pr_err("RME: RMM doesn't support RSI version %lu.%lu. 
Supported range: %lu.%lu-%lu.%lu\n", + RSI_ABI_VERSION_MAJOR, RSI_ABI_VERSION_MINOR, + RSI_ABI_VERSION_GET_MAJOR(ver_lower), + RSI_ABI_VERSION_GET_MINOR(ver_lower), + RSI_ABI_VERSION_GET_MAJOR(ver_higher), + RSI_ABI_VERSION_GET_MINOR(ver_higher)); + return false; + } + + pr_info("RME: Using RSI version %lu.%lu\n", + RSI_ABI_VERSION_GET_MAJOR(ver_lower), + RSI_ABI_VERSION_GET_MINOR(ver_lower)); + + return true; +} + +static void __init arm64_rsi_setup_memory(void) +{ + u64 i; + phys_addr_t start, end; + + /* + * Iterate over the available memory ranges and convert the state to + * protected memory. We should take extra care to ensure that we DO NOT + * permit any "DESTROYED" pages to be converted to "RAM". + * + * panic() is used because if the attempt to switch the memory to + * protected has failed here, then future accesses to the memory are + * simply going to be reflected as a SEA (Synchronous External Abort) + * which we can't handle. Bailing out early prevents the guest limping + * on and dying later. + */ + for_each_mem_range(i, &start, &end) { + if (rsi_set_memory_range_protected_safe(start, end)) { + panic("Failed to set memory range to protected: %pa-%pa", + &start, &end); + } + } +} + +void __init arm64_rsi_init(void) +{ + if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_SMC) + return; + if (!rsi_version_matches()) + return; + + arm64_rsi_setup_memory(); + + static_branch_enable(&rsi_present); +} + diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index b22d28ec8028..b5e1e306fa51 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -351,6 +352,8 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) else psci_acpi_init(); + arm64_rsi_init(); + init_bootcpu_ops(); smp_init_cpus(); smp_build_mpidr_hash(); From 399306954996be58ac20b4b29f6334e3d55a2ce7 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Thu, 17 Oct 2024 14:14:26 +0100 Subject: [PATCH 092/168] arm64: realm: Query IPA size from the RMM The top bit of the configured IPA size is used as an attribute to control whether the address is protected or shared. Query the configuration from the RMM to assertain which bit this is. Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Co-developed-by: Suzuki K Poulose Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-4-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable-prot.h | 4 ++++ arch/arm64/kernel/rsi.c | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 2a11d0c10760..820a3b06f08c 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -68,8 +68,12 @@ #include #include +#include extern bool arm64_use_ng_mappings; +extern unsigned long prot_ns_shared; + +#define PROT_NS_SHARED (is_realm_world() ? prot_ns_shared : 0) #define PTE_MAYBE_NG (arm64_use_ng_mappings ? PTE_NG : 0) #define PMD_MAYBE_NG (arm64_use_ng_mappings ? 
PMD_SECT_NG : 0) diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index c5758317dfed..cea8f0d39591 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -8,6 +8,11 @@ #include #include +static struct realm_config config; + +unsigned long prot_ns_shared; +EXPORT_SYMBOL(prot_ns_shared); + DEFINE_STATIC_KEY_FALSE_RO(rsi_present); EXPORT_SYMBOL(rsi_present); @@ -68,6 +73,9 @@ void __init arm64_rsi_init(void) return; if (!rsi_version_matches()) return; + if (WARN_ON(rsi_get_realm_config(&config))) + return; + prot_ns_shared = BIT(config.ipa_bits - 1); arm64_rsi_setup_memory(); From 371589437616fbb03590d8ff505f8a4c95c8a031 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 17 Oct 2024 14:14:27 +0100 Subject: [PATCH 093/168] arm64: rsi: Add support for checking whether an MMIO is protected On Arm CCA, with RMM-v1.0, all MMIO regions are shared. However, in the future, an Arm CCA-v1.0 compliant guest may be run in a lesser privileged partition in the Realm World (with Arm CCA-v1.1 Planes feature). In this case, some of the MMIO regions may be emulated by a higher privileged component in the Realm world, i.e, protected. Thus the guest must decide today, whether a given MMIO region is shared vs Protected and create the stage1 mapping accordingly. On Arm CCA, this detection is based on the "IPA State" (RIPAS == RIPAS_IO). Provide a helper to run this check on a given range of MMIO. Also, provide a arm64 helper which may be hooked in by other solutions. Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-5-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/io.h | 8 ++++++++ arch/arm64/include/asm/rsi.h | 2 ++ arch/arm64/include/asm/rsi_cmds.h | 21 +++++++++++++++++++++ arch/arm64/kernel/rsi.c | 26 ++++++++++++++++++++++++++ 4 files changed, 57 insertions(+) diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 1ada23a6ec19..8688343b71f2 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -17,6 +17,7 @@ #include #include #include +#include /* * Generic IO read/write. These perform native-endian accesses. 
@@ -318,4 +319,11 @@ extern bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, unsigned long flags); #define arch_memremap_can_ram_remap arch_memremap_can_ram_remap +static inline bool arm64_is_protected_mmio(phys_addr_t phys_addr, size_t size) +{ + if (unlikely(is_realm_world())) + return __arm64_is_protected_mmio(phys_addr, size); + return false; +} + #endif /* __ASM_IO_H */ diff --git a/arch/arm64/include/asm/rsi.h b/arch/arm64/include/asm/rsi.h index acba065eb00e..188cbb9b23f5 100644 --- a/arch/arm64/include/asm/rsi.h +++ b/arch/arm64/include/asm/rsi.h @@ -14,6 +14,8 @@ DECLARE_STATIC_KEY_FALSE(rsi_present); void __init arm64_rsi_init(void); +bool __arm64_is_protected_mmio(phys_addr_t base, size_t size); + static inline bool is_realm_world(void) { return static_branch_unlikely(&rsi_present); diff --git a/arch/arm64/include/asm/rsi_cmds.h b/arch/arm64/include/asm/rsi_cmds.h index 2fcf351b5634..e6a211001bd3 100644 --- a/arch/arm64/include/asm/rsi_cmds.h +++ b/arch/arm64/include/asm/rsi_cmds.h @@ -45,6 +45,27 @@ static inline unsigned long rsi_get_realm_config(struct realm_config *cfg) return res.a0; } +static inline unsigned long rsi_ipa_state_get(phys_addr_t start, + phys_addr_t end, + enum ripas *state, + phys_addr_t *top) +{ + struct arm_smccc_res res; + + arm_smccc_smc(SMC_RSI_IPA_STATE_GET, + start, end, 0, 0, 0, 0, 0, + &res); + + if (res.a0 == RSI_SUCCESS) { + if (top) + *top = res.a1; + if (state) + *state = res.a2; + } + + return res.a0; +} + static inline long rsi_set_addr_range_state(phys_addr_t start, phys_addr_t end, enum ripas state, diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index cea8f0d39591..7e7934c4fca0 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -67,6 +67,32 @@ static void __init arm64_rsi_setup_memory(void) } } +bool __arm64_is_protected_mmio(phys_addr_t base, size_t size) +{ + enum ripas ripas; + phys_addr_t end, top; + + /* Overflow ? */ + if (WARN_ON(base + size <= base)) + return false; + + end = ALIGN(base + size, RSI_GRANULE_SIZE); + base = ALIGN_DOWN(base, RSI_GRANULE_SIZE); + + while (base < end) { + if (WARN_ON(rsi_ipa_state_get(base, end, &ripas, &top))) + break; + if (WARN_ON(top <= base)) + break; + if (ripas != RSI_RIPAS_DEV) + break; + base = top; + } + + return base >= end; +} +EXPORT_SYMBOL(__arm64_is_protected_mmio); + void __init arm64_rsi_init(void) { if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_SMC) From 3c6c706139564f74ec48229378873c1d930a8bc8 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 17 Oct 2024 14:14:28 +0100 Subject: [PATCH 094/168] arm64: rsi: Map unprotected MMIO as decrypted Instead of marking every MMIO as shared, check if the given region is "Protected" and apply the permissions accordingly. 
Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-6-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/rsi.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index 7e7934c4fca0..3e0c83e2296f 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -6,6 +6,8 @@ #include #include #include + +#include #include static struct realm_config config; @@ -93,6 +95,16 @@ bool __arm64_is_protected_mmio(phys_addr_t base, size_t size) } EXPORT_SYMBOL(__arm64_is_protected_mmio); +static int realm_ioremap_hook(phys_addr_t phys, size_t size, pgprot_t *prot) +{ + if (__arm64_is_protected_mmio(phys, size)) + *prot = pgprot_encrypted(*prot); + else + *prot = pgprot_decrypted(*prot); + + return 0; +} + void __init arm64_rsi_init(void) { if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_SMC) @@ -103,6 +115,9 @@ void __init arm64_rsi_init(void) return; prot_ns_shared = BIT(config.ipa_bits - 1); + if (arm64_ioremap_prot_hook_register(realm_ioremap_hook)) + return; + arm64_rsi_setup_memory(); static_branch_enable(&rsi_present); From 491db21d8256992ab9fe11c42744eb3044315d14 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 17 Oct 2024 14:14:29 +0100 Subject: [PATCH 095/168] efi: arm64: Map Device with Prot Shared Device mappings need to be emulated by the VMM so must be mapped shared with the host. Reviewed-by: Gavin Shan Reviewed-by: Catalin Marinas Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-7-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/efi.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 712718aed5dd..1d25d8899dbf 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -34,8 +34,16 @@ static __init pteval_t create_mapping_protection(efi_memory_desc_t *md) u64 attr = md->attribute; u32 type = md->type; - if (type == EFI_MEMORY_MAPPED_IO) - return PROT_DEVICE_nGnRE; + if (type == EFI_MEMORY_MAPPED_IO) { + pgprot_t prot = __pgprot(PROT_DEVICE_nGnRE); + + if (arm64_is_protected_mmio(md->phys_addr, + md->num_pages << EFI_PAGE_SHIFT)) + prot = pgprot_encrypted(prot); + else + prot = pgprot_decrypted(prot); + return pgprot_val(prot); + } if (region_is_misaligned(md)) { static bool __initdata code_is_misaligned; From fbf979a01375704fa87c559763209c658593b6f8 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Thu, 17 Oct 2024 14:14:30 +0100 Subject: [PATCH 096/168] arm64: Enforce bounce buffers for realm DMA Within a realm guest it's not possible for a device emulated by the VMM to access arbitrary guest memory. So force the use of bounce buffers to ensure that the memory the emulated devices are accessing is in memory which is explicitly shared with the host. This adds a call to swiotlb_update_mem_attributes() which calls set_memory_decrypted() to ensure the bounce buffer memory is shared with the host. For non-realm guests or hosts this is a no-op. 
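From a driver's point of view nothing changes: an ordinary streaming DMA mapping is transparently bounced through the shared (decrypted) swiotlb pool inside a realm, and is unaffected elsewhere. A minimal, hypothetical illustration ('dev', 'buf' and 'len' are placeholders):

	dma_addr_t dma = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, dma))
		return -ENOMEM;
	/* the emulated device accesses the bounce copy via 'dma' */
	dma_unmap_single(dev, dma, len, DMA_TO_DEVICE);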
Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Co-developed-by: Suzuki K Poulose Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-8-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/rsi.c | 1 + arch/arm64/mm/init.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index 3e0c83e2296f..a23c0a7154d2 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 27a32ff15412..d21f67d67cf5 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -366,8 +367,14 @@ void __init bootmem_init(void) */ void __init mem_init(void) { + unsigned int flags = SWIOTLB_VERBOSE; bool swiotlb = max_pfn > PFN_DOWN(arm64_dma_phys_limit); + if (is_realm_world()) { + swiotlb = true; + flags |= SWIOTLB_FORCE; + } + if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && !swiotlb) { /* * If no bouncing needed for ZONE_DMA, reduce the swiotlb @@ -379,7 +386,8 @@ void __init mem_init(void) swiotlb = true; } - swiotlb_init(swiotlb, SWIOTLB_VERBOSE); + swiotlb_init(swiotlb, flags); + swiotlb_update_mem_attributes(); /* this will put all unused low memory onto the freelists */ memblock_free_all(); From 0e9cb5995b2539a332fe65ada6a28a6be55f6e40 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Thu, 17 Oct 2024 14:14:31 +0100 Subject: [PATCH 097/168] arm64: mm: Avoid TLBI when marking pages as valid When __change_memory_common() is purely setting the valid bit on a PTE (e.g. via the set_memory_valid() call) there is no need for a TLBI as either the entry isn't changing (the valid bit was already set) or the entry was invalid and so should not have been cached in the TLB. Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Reviewed-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-9-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/mm/pageattr.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 0e270a1c51e6..547a9e0b46c2 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -60,7 +60,13 @@ static int __change_memory_common(unsigned long start, unsigned long size, ret = apply_to_page_range(&init_mm, start, size, change_page_range, &data); - flush_tlb_kernel_range(start, start + size); + /* + * If the memory is being made valid without changing any other bits + * then a TLBI isn't required as a non-valid entry cannot be cached in + * the TLB. + */ + if (pgprot_val(set_mask) != PTE_VALID || pgprot_val(clear_mask)) + flush_tlb_kernel_range(start, start + size); return ret; } From 42be24a4178fe51e6f47d91d8621b2f53820f88b Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Thu, 17 Oct 2024 14:14:32 +0100 Subject: [PATCH 098/168] arm64: Enable memory encrypt for Realms Use the memory encryption APIs to trigger a RSI call to request a transition between protected memory and shared memory (or vice versa) and updating the kernel's linear map of modified pages to flip the top bit of the IPA. This requires that block mappings are not used in the direct map for realm guests. 
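For reference, a hedged sketch of the generic interface this patch wires up: a caller that wants to let the host access a buffer transitions it to shared and later back to protected. 'page' and 'nr_pages' are placeholders; a failure means the affected pages must be leaked, which is why the implementation WARNs rather than trying to recover.

	unsigned long addr = (unsigned long)page_address(page);
	int ret;

	ret = set_memory_decrypted(addr, nr_pages);	/* protected -> shared */
	if (ret)
		return ret;
	/* ... host/VMM may now access the buffer ... */
	ret = set_memory_encrypted(addr, nr_pages);	/* shared -> protected */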
Reviewed-by: Catalin Marinas Reviewed-by: Gavin Shan Signed-off-by: Suzuki K Poulose Co-developed-by: Steven Price Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-10-steven.price@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 3 + arch/arm64/include/asm/mem_encrypt.h | 9 +++ arch/arm64/include/asm/pgtable.h | 5 ++ arch/arm64/include/asm/set_memory.h | 3 + arch/arm64/kernel/rsi.c | 16 +++++ arch/arm64/mm/pageattr.c | 90 +++++++++++++++++++++++++++- 6 files changed, 123 insertions(+), 3 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3e29b44d2d7b..ccea9c22d6df 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -21,6 +21,7 @@ config ARM64 select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_HAS_CACHE_LINE_SIZE + select ARCH_HAS_CC_PLATFORM select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE @@ -44,6 +45,8 @@ config ARM64 select ARCH_HAS_SETUP_DMA_OPS select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_SET_MEMORY + select ARCH_HAS_MEM_ENCRYPT + select ARCH_HAS_FORCE_DMA_UNENCRYPTED select ARCH_STACKWALK select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX diff --git a/arch/arm64/include/asm/mem_encrypt.h b/arch/arm64/include/asm/mem_encrypt.h index b0c9a86b13a4..f8f78f622dd2 100644 --- a/arch/arm64/include/asm/mem_encrypt.h +++ b/arch/arm64/include/asm/mem_encrypt.h @@ -2,6 +2,8 @@ #ifndef __ASM_MEM_ENCRYPT_H #define __ASM_MEM_ENCRYPT_H +#include + struct arm64_mem_crypt_ops { int (*encrypt)(unsigned long addr, int numpages); int (*decrypt)(unsigned long addr, int numpages); @@ -12,4 +14,11 @@ int arm64_mem_crypt_ops_register(const struct arm64_mem_crypt_ops *ops); int set_memory_encrypted(unsigned long addr, int numpages); int set_memory_decrypted(unsigned long addr, int numpages); +int realm_register_memory_enc_ops(void); + +static inline bool force_dma_unencrypted(struct device *dev) +{ + return is_realm_world(); +} + #endif /* __ASM_MEM_ENCRYPT_H */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index c329ea061dc9..7e4bdc8259a2 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -684,6 +684,11 @@ static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, #define pgprot_nx(prot) \ __pgprot_modify(prot, PTE_MAYBE_GP, PTE_PXN) +#define pgprot_decrypted(prot) \ + __pgprot_modify(prot, PROT_NS_SHARED, PROT_NS_SHARED) +#define pgprot_encrypted(prot) \ + __pgprot_modify(prot, PROT_NS_SHARED, 0) + /* * Mark the prot value as uncacheable and unbufferable. 
*/ diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h index 917761feeffd..37774c793006 100644 --- a/arch/arm64/include/asm/set_memory.h +++ b/arch/arm64/include/asm/set_memory.h @@ -15,4 +15,7 @@ int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); bool kernel_page_present(struct page *page); +int set_memory_encrypted(unsigned long addr, int numpages); +int set_memory_decrypted(unsigned long addr, int numpages); + #endif /* _ASM_ARM64_SET_MEMORY_H */ diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c index a23c0a7154d2..3031f25c32ef 100644 --- a/arch/arm64/kernel/rsi.c +++ b/arch/arm64/kernel/rsi.c @@ -7,8 +7,10 @@ #include #include #include +#include #include +#include #include static struct realm_config config; @@ -19,6 +21,17 @@ EXPORT_SYMBOL(prot_ns_shared); DEFINE_STATIC_KEY_FALSE_RO(rsi_present); EXPORT_SYMBOL(rsi_present); +bool cc_platform_has(enum cc_attr attr) +{ + switch (attr) { + case CC_ATTR_MEM_ENCRYPT: + return is_realm_world(); + default: + return false; + } +} +EXPORT_SYMBOL_GPL(cc_platform_has); + static bool rsi_version_matches(void) { unsigned long ver_lower, ver_higher; @@ -119,6 +132,9 @@ void __init arm64_rsi_init(void) if (arm64_ioremap_prot_hook_register(realm_ioremap_hook)) return; + if (realm_register_memory_enc_ops()) + return; + arm64_rsi_setup_memory(); static_branch_enable(&rsi_present); diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 547a9e0b46c2..6ae6ae806454 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -5,10 +5,12 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -23,14 +25,16 @@ bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED bool can_set_direct_map(void) { /* - * rodata_full and DEBUG_PAGEALLOC require linear map to be - * mapped at page granularity, so that it is possible to + * rodata_full, DEBUG_PAGEALLOC and a Realm guest all require linear + * map to be mapped at page granularity, so that it is possible to * protect/unprotect single pages. * * KFENCE pool requires page-granular mapping if initialized late. + * + * Realms need to make pages shared/protected at page granularity. 
*/ return rodata_full || debug_pagealloc_enabled() || - arm64_kfence_can_set_direct_map(); + arm64_kfence_can_set_direct_map() || is_realm_world(); } static int change_page_range(pte_t *ptep, unsigned long addr, void *data) @@ -198,6 +202,86 @@ int set_direct_map_default_noflush(struct page *page) PAGE_SIZE, change_page_range, &data); } +static int __set_memory_enc_dec(unsigned long addr, + int numpages, + bool encrypt) +{ + unsigned long set_prot = 0, clear_prot = 0; + phys_addr_t start, end; + int ret; + + if (!is_realm_world()) + return 0; + + if (!__is_lm_address(addr)) + return -EINVAL; + + start = __virt_to_phys(addr); + end = start + numpages * PAGE_SIZE; + + if (encrypt) + clear_prot = PROT_NS_SHARED; + else + set_prot = PROT_NS_SHARED; + + /* + * Break the mapping before we make any changes to avoid stale TLB + * entries or Synchronous External Aborts caused by RIPAS_EMPTY + */ + ret = __change_memory_common(addr, PAGE_SIZE * numpages, + __pgprot(set_prot), + __pgprot(clear_prot | PTE_VALID)); + + if (ret) + return ret; + + if (encrypt) + ret = rsi_set_memory_range_protected(start, end); + else + ret = rsi_set_memory_range_shared(start, end); + + if (ret) + return ret; + + return __change_memory_common(addr, PAGE_SIZE * numpages, + __pgprot(PTE_VALID), + __pgprot(0)); +} + +static int realm_set_memory_encrypted(unsigned long addr, int numpages) +{ + int ret = __set_memory_enc_dec(addr, numpages, true); + + /* + * If the request to change state fails, then the only sensible cause + * of action for the caller is to leak the memory + */ + WARN(ret, "Failed to encrypt memory, %d pages will be leaked", + numpages); + + return ret; +} + +static int realm_set_memory_decrypted(unsigned long addr, int numpages) +{ + int ret = __set_memory_enc_dec(addr, numpages, false); + + WARN(ret, "Failed to decrypt memory, %d pages will be leaked", + numpages); + + return ret; +} + +static const struct arm64_mem_crypt_ops realm_crypt_ops = { + .encrypt = realm_set_memory_encrypted, + .decrypt = realm_set_memory_decrypted, +}; + +int realm_register_memory_enc_ops(void) +{ + return arm64_mem_crypt_ops_register(&realm_crypt_ops); +} + #ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { From 7999edc484ca376f803562edb2d43ec921642c2a Mon Sep 17 00:00:00 2001 From: Sami Mujawar Date: Thu, 17 Oct 2024 14:14:33 +0100 Subject: [PATCH 099/168] virt: arm-cca-guest: TSM_REPORT support for realms Introduce an arm-cca-guest driver that registers with the configfs-tsm module to provide user interfaces for retrieving an attestation token. When a new report is requested the arm-cca-guest driver invokes the appropriate RSI interfaces to query an attestation token. The steps to retrieve an attestation token are as follows: 1. Mount the configfs filesystem if not already mounted mount -t configfs none /sys/kernel/config 2. 
Generate an attestation token report=/sys/kernel/config/tsm/report/report0 mkdir $report dd if=/dev/urandom bs=64 count=1 > $report/inblob hexdump -C $report/outblob rmdir $report Signed-off-by: Sami Mujawar Signed-off-by: Suzuki K Poulose Signed-off-by: Steven Price Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/20241017131434.40935-11-steven.price@arm.com Signed-off-by: Catalin Marinas --- drivers/virt/coco/Kconfig | 2 + drivers/virt/coco/Makefile | 1 + drivers/virt/coco/arm-cca-guest/Kconfig | 11 + drivers/virt/coco/arm-cca-guest/Makefile | 2 + .../virt/coco/arm-cca-guest/arm-cca-guest.c | 224 ++++++++++++++++++ 5 files changed, 240 insertions(+) create mode 100644 drivers/virt/coco/arm-cca-guest/Kconfig create mode 100644 drivers/virt/coco/arm-cca-guest/Makefile create mode 100644 drivers/virt/coco/arm-cca-guest/arm-cca-guest.c diff --git a/drivers/virt/coco/Kconfig b/drivers/virt/coco/Kconfig index d9ff676bf48d..ff869d883d95 100644 --- a/drivers/virt/coco/Kconfig +++ b/drivers/virt/coco/Kconfig @@ -14,3 +14,5 @@ source "drivers/virt/coco/pkvm-guest/Kconfig" source "drivers/virt/coco/sev-guest/Kconfig" source "drivers/virt/coco/tdx-guest/Kconfig" + +source "drivers/virt/coco/arm-cca-guest/Kconfig" diff --git a/drivers/virt/coco/Makefile b/drivers/virt/coco/Makefile index b69c30c1c720..c3d07cfc087e 100644 --- a/drivers/virt/coco/Makefile +++ b/drivers/virt/coco/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_EFI_SECRET) += efi_secret/ obj-$(CONFIG_ARM_PKVM_GUEST) += pkvm-guest/ obj-$(CONFIG_SEV_GUEST) += sev-guest/ obj-$(CONFIG_INTEL_TDX_GUEST) += tdx-guest/ +obj-$(CONFIG_ARM_CCA_GUEST) += arm-cca-guest/ diff --git a/drivers/virt/coco/arm-cca-guest/Kconfig b/drivers/virt/coco/arm-cca-guest/Kconfig new file mode 100644 index 000000000000..9dd27c3ee215 --- /dev/null +++ b/drivers/virt/coco/arm-cca-guest/Kconfig @@ -0,0 +1,11 @@ +config ARM_CCA_GUEST + tristate "Arm CCA Guest driver" + depends on ARM64 + default m + select TSM_REPORTS + help + The driver provides userspace interface to request and + attestation report from the Realm Management Monitor(RMM). + + If you choose 'M' here, this module will be called + arm-cca-guest. diff --git a/drivers/virt/coco/arm-cca-guest/Makefile b/drivers/virt/coco/arm-cca-guest/Makefile new file mode 100644 index 000000000000..69eeba08e98a --- /dev/null +++ b/drivers/virt/coco/arm-cca-guest/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_ARM_CCA_GUEST) += arm-cca-guest.o diff --git a/drivers/virt/coco/arm-cca-guest/arm-cca-guest.c b/drivers/virt/coco/arm-cca-guest/arm-cca-guest.c new file mode 100644 index 000000000000..488153879ec9 --- /dev/null +++ b/drivers/virt/coco/arm-cca-guest/arm-cca-guest.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +/** + * struct arm_cca_token_info - a descriptor for the token buffer. 
+ * @challenge: Pointer to the challenge data + * @challenge_size: Size of the challenge data + * @granule: PA of the granule to which the token will be written + * @offset: Offset within granule to start of buffer in bytes + * @result: result of rsi_attestation_token_continue operation + */ +struct arm_cca_token_info { + void *challenge; + unsigned long challenge_size; + phys_addr_t granule; + unsigned long offset; + unsigned long result; +}; + +static void arm_cca_attestation_init(void *param) +{ + struct arm_cca_token_info *info; + + info = (struct arm_cca_token_info *)param; + + info->result = rsi_attestation_token_init(info->challenge, + info->challenge_size); +} + +/** + * arm_cca_attestation_continue - Retrieve the attestation token data. + * + * @param: pointer to the arm_cca_token_info + * + * Attestation token generation is a long running operation and therefore + * the token data may not be retrieved in a single call. Moreover, the + * token retrieval operation must be requested on the same CPU on which the + * attestation token generation was initialised. + * This helper function is therefore scheduled on the same CPU multiple + * times until the entire token data is retrieved. + */ +static void arm_cca_attestation_continue(void *param) +{ + unsigned long len; + unsigned long size; + struct arm_cca_token_info *info; + + info = (struct arm_cca_token_info *)param; + + size = RSI_GRANULE_SIZE - info->offset; + info->result = rsi_attestation_token_continue(info->granule, + info->offset, size, &len); + info->offset += len; +} + +/** + * arm_cca_report_new - Generate a new attestation token. + * + * @report: pointer to the TSM report context information. + * @data: pointer to the context specific data for this module. + * + * Initialise the attestation token generation using the challenge data + * passed in the TSM descriptor. Allocate memory for the attestation token + * and schedule calls to retrieve the attestation token on the same CPU + * on which the attestation token generation was initialised. + * + * The challenge data must be at least 32 bytes and no more than 64 bytes. If + * less than 64 bytes are provided it will be zero padded to 64 bytes. + * + * Return: + * * %0 - Attestation token generated successfully. + * * %-EINVAL - A parameter was not valid. + * * %-ENOMEM - Out of memory. + * * %-EFAULT - Failed to get IPA for memory page(s). + * * A negative status code as returned by smp_call_function_single(). + */ +static int arm_cca_report_new(struct tsm_report *report, void *data) +{ + int ret; + int cpu; + long max_size; + unsigned long token_size = 0; + struct arm_cca_token_info info; + void *buf; + u8 *token __free(kvfree) = NULL; + struct tsm_desc *desc = &report->desc; + + if (desc->inblob_len < 32 || desc->inblob_len > 64) + return -EINVAL; + + /* + * The attestation token 'init' and 'continue' calls must be + * performed on the same CPU. smp_call_function_single() is used + * instead of simply calling get_cpu() because of the need to + * allocate outblob based on the returned value from the 'init' + * call and that cannot be done in an atomic context. 
+ */ + cpu = smp_processor_id(); + + info.challenge = desc->inblob; + info.challenge_size = desc->inblob_len; + + ret = smp_call_function_single(cpu, arm_cca_attestation_init, + &info, true); + if (ret) + return ret; + max_size = info.result; + + if (max_size <= 0) + return -EINVAL; + + /* Allocate outblob */ + token = kvzalloc(max_size, GFP_KERNEL); + if (!token) + return -ENOMEM; + + /* + * Since the outblob may not be physically contiguous, use a page + * to bounce the buffer from RMM. + */ + buf = alloc_pages_exact(RSI_GRANULE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* Get the PA of the memory page(s) that were allocated */ + info.granule = (unsigned long)virt_to_phys(buf); + + /* Loop until the token is ready or there is an error */ + do { + /* Retrieve one RSI_GRANULE_SIZE data per loop iteration */ + info.offset = 0; + do { + /* + * Schedule a call to retrieve a sub-granule chunk + * of data per loop iteration. + */ + ret = smp_call_function_single(cpu, + arm_cca_attestation_continue, + (void *)&info, true); + if (ret != 0) { + token_size = 0; + goto exit_free_granule_page; + } + } while (info.result == RSI_INCOMPLETE && + info.offset < RSI_GRANULE_SIZE); + + if (info.result != RSI_SUCCESS) { + ret = -ENXIO; + token_size = 0; + goto exit_free_granule_page; + } + + /* + * Copy the retrieved token data from the granule + * to the token buffer, ensuring that the RMM doesn't + * overflow the buffer. + */ + if (WARN_ON(token_size + info.offset > max_size)) + break; + memcpy(&token[token_size], buf, info.offset); + token_size += info.offset; + } while (info.result == RSI_INCOMPLETE); + + report->outblob = no_free_ptr(token); +exit_free_granule_page: + report->outblob_len = token_size; + free_pages_exact(buf, RSI_GRANULE_SIZE); + return ret; +} + +static const struct tsm_ops arm_cca_tsm_ops = { + .name = KBUILD_MODNAME, + .report_new = arm_cca_report_new, +}; + +/** + * arm_cca_guest_init - Register with the Trusted Security Module (TSM) + * interface. + * + * Return: + * * %0 - Registered successfully with the TSM interface. + * * %-ENODEV - The execution context is not an Arm Realm. + * * %-EBUSY - Already registered. + */ +static int __init arm_cca_guest_init(void) +{ + int ret; + + if (!is_realm_world()) + return -ENODEV; + + ret = tsm_register(&arm_cca_tsm_ops, NULL); + if (ret < 0) + pr_err("Error %d registering with TSM\n", ret); + + return ret; +} +module_init(arm_cca_guest_init); + +/** + * arm_cca_guest_exit - unregister with the Trusted Security Module (TSM) + * interface. + */ +static void __exit arm_cca_guest_exit(void) +{ + tsm_unregister(&arm_cca_tsm_ops); +} +module_exit(arm_cca_guest_exit); + +MODULE_AUTHOR("Sami Mujawar "); +MODULE_DESCRIPTION("Arm CCA Guest TSM Driver"); +MODULE_LICENSE("GPL"); From 972d755f01954bd0e36d8696f0d7dc6466072c21 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Thu, 17 Oct 2024 14:14:34 +0100 Subject: [PATCH 100/168] arm64: Document Arm Confidential Compute Add some documentation on Arm CCA and the requirements for running Linux as a Realm guest. Also update booting.rst to describe the requirement for RIPAS RAM. 
Reviewed-by: Gavin Shan Reviewed-by: Suzuki K Poulose Signed-off-by: Steven Price Link: https://lore.kernel.org/r/20241017131434.40935-12-steven.price@arm.com Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/arm-cca.rst | 69 ++++++++++++++++++++++++++++ Documentation/arch/arm64/booting.rst | 3 ++ Documentation/arch/arm64/index.rst | 1 + 3 files changed, 73 insertions(+) create mode 100644 Documentation/arch/arm64/arm-cca.rst diff --git a/Documentation/arch/arm64/arm-cca.rst b/Documentation/arch/arm64/arm-cca.rst new file mode 100644 index 000000000000..c48b7d4ab6bd --- /dev/null +++ b/Documentation/arch/arm64/arm-cca.rst @@ -0,0 +1,69 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================================== +Arm Confidential Compute Architecture +===================================== + +Arm systems that support the Realm Management Extension (RME) contain +hardware to allow a VM guest to be run in a way which protects the code +and data of the guest from the hypervisor. It extends the older "two +world" model (Normal and Secure World) into four worlds: Normal, Secure, +Root and Realm. Linux can then also be run as a guest to a monitor +running in the Realm world. + +The monitor running in the Realm world is known as the Realm Management +Monitor (RMM) and implements the Realm Management Monitor +specification[1]. The monitor acts a bit like a hypervisor (e.g. it runs +in EL2 and manages the stage 2 page tables etc of the guests running in +Realm world), however much of the control is handled by a hypervisor +running in the Normal World. The Normal World hypervisor uses the Realm +Management Interface (RMI) defined by the RMM specification to request +the RMM to perform operations (e.g. mapping memory or executing a vCPU). + +The RMM defines an environment for guests where the address space (IPA) +is split into two. The lower half is protected - any memory that is +mapped in this half cannot be seen by the Normal World and the RMM +restricts what operations the Normal World can perform on this memory +(e.g. the Normal World cannot replace pages in this region without the +guest's cooperation). The upper half is shared, the Normal World is free +to make changes to the pages in this region, and is able to emulate MMIO +devices in this region too. + +A guest running in a Realm may also communicate with the RMM using the +Realm Services Interface (RSI) to request changes in its environment or +to perform attestation about its environment. In particular it may +request that areas of the protected address space are transitioned +between 'RAM' and 'EMPTY' (in either direction). This allows a Realm +guest to give up memory to be returned to the Normal World, or to +request new memory from the Normal World. Without an explicit request +from the Realm guest the RMM will otherwise prevent the Normal World +from making these changes. + +Linux as a Realm Guest +---------------------- + +To run Linux as a guest within a Realm, the following must be provided +either by the VMM or by a `boot loader` run in the Realm before Linux: + + * All protected RAM described to Linux (by DT or ACPI) must be marked + RIPAS RAM before handing control over to Linux. + + * MMIO devices must be either unprotected (e.g. emulated by the Normal + World) or marked RIPAS DEV. + + * MMIO devices emulated by the Normal World and used very early in boot + (specifically earlycon) must be specified in the upper half of IPA. + For earlycon this can be done by specifying the address on the + command line, e.g. 
with an IPA size of 33 bits and the base address + of the emulated UART at 0x1000000: ``earlycon=uart,mmio,0x101000000`` + + * Linux will use bounce buffers for communicating with unprotected + devices. It will transition some protected memory to RIPAS EMPTY and + expect to be able to access unprotected pages at the same IPA address + but with the highest valid IPA bit set. The expectation is that the + VMM will remove the physical pages from the protected mapping and + provide those pages as unprotected pages. + +References +---------- +[1] https://developer.arm.com/documentation/den0137/ diff --git a/Documentation/arch/arm64/booting.rst b/Documentation/arch/arm64/booting.rst index b57776a68f15..30164fb24a24 100644 --- a/Documentation/arch/arm64/booting.rst +++ b/Documentation/arch/arm64/booting.rst @@ -41,6 +41,9 @@ to automatically locate and size all RAM, or it may use knowledge of the RAM in the machine, or any other method the boot loader designer sees fit.) +For Arm Confidential Compute Realms this includes ensuring that all +protected RAM has a Realm IPA state (RIPAS) of "RAM". + 2. Setup the device tree ------------------------- diff --git a/Documentation/arch/arm64/index.rst b/Documentation/arch/arm64/index.rst index 78544de0a8a9..12c243c3af20 100644 --- a/Documentation/arch/arm64/index.rst +++ b/Documentation/arch/arm64/index.rst @@ -10,6 +10,7 @@ ARM64 Architecture acpi_object_usage amu arm-acpi + arm-cca asymmetric-32bit booting cpu-feature-registers From 358dd4a9bdac63a0a8fb13773bfce6f599e25433 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 21 Oct 2024 19:14:34 +0100 Subject: [PATCH 101/168] arm64: Add command-line override for ID_AA64MMFR0_EL1.ECV It appears that relatively popular hardware out there implements the CNTPOFF_EL2 variant of FEAT_ECV, advertises it via ID_AA64MMFR0_EL1, but cannot be bothered to set SCR_EL3.ECVEn to 1. You would probably think that "this is fine, EL3 will take the trap on access to CNTPOFF_EL2 and flip the ECVEn bit", as that's what a semi-decent firmware implementation would do. But no. None of that. This particular implementation takes the trap, considers its purpose in life, decides that it has none, and *RESETS* the system. Yes, x1e001de, I'm talking about you. In order to allow this machine to be promoted slightly above the level of a glorified door-stop, add a new "id_aa64mmfr0.ecv" override. allowing the kernel to pretend this option was never there. 
Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20241021181434.1052974-1-maz@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/pi/idreg-override.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index 29d4b6244a6f..fbc37b2733e2 100644 --- a/arch/arm64/kernel/pi/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -38,6 +38,15 @@ struct ftr_set_desc { #define FIELD(n, s, f) { .name = n, .shift = s, .width = 4, .filter = f } +static const struct ftr_set_desc mmfr0 __prel64_initconst = { + .name = "id_aa64mmfr0", + .override = &id_aa64mmfr0_override, + .fields = { + FIELD("ecv", ID_AA64MMFR0_EL1_ECV_SHIFT, NULL), + {} + }, +}; + static bool __init mmfr1_vh_filter(u64 val) { /* @@ -196,6 +205,7 @@ static const struct ftr_set_desc sw_features __prel64_initconst = { static const PREL64(const struct ftr_set_desc, reg) regs[] __prel64_initconst = { + { &mmfr0 }, { &mmfr1 }, { &mmfr2 }, { &pfr0 }, From dca93d29845dfed60910ba13dbfb6ae6a0e19f6d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 23 Oct 2024 00:20:45 +0100 Subject: [PATCH 102/168] kselftest/arm64: Log fp-stress child startup errors to stdout Currently if we encounter an error between fork() and exec() of a child process we log the error to stderr. This means that the errors don't get annotated with the child information which makes diagnostics harder and means that if we miss the exit signal from the child we can deadlock waiting for output from the child. Improve robustness and output quality by logging to stdout instead. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241023-arm64-fp-stress-exec-fail-v1-1-ee3c62932c15@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/fp-stress.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c index e62c9dbad501..13958e645afc 100644 --- a/tools/testing/selftests/arm64/fp/fp-stress.c +++ b/tools/testing/selftests/arm64/fp/fp-stress.c @@ -79,7 +79,7 @@ static void child_start(struct child_data *child, const char *program) */ ret = dup2(pipefd[1], 1); if (ret == -1) { - fprintf(stderr, "dup2() %d\n", errno); + printf("dup2() %d\n", errno); exit(EXIT_FAILURE); } @@ -89,7 +89,7 @@ static void child_start(struct child_data *child, const char *program) */ ret = dup2(startup_pipe[0], 3); if (ret == -1) { - fprintf(stderr, "dup2() %d\n", errno); + printf("dup2() %d\n", errno); exit(EXIT_FAILURE); } @@ -107,16 +107,15 @@ static void child_start(struct child_data *child, const char *program) */ ret = read(3, &i, sizeof(i)); if (ret < 0) - fprintf(stderr, "read(startp pipe) failed: %s (%d)\n", - strerror(errno), errno); + printf("read(startp pipe) failed: %s (%d)\n", + strerror(errno), errno); if (ret > 0) - fprintf(stderr, "%d bytes of data on startup pipe\n", - ret); + printf("%d bytes of data on startup pipe\n", ret); close(3); ret = execl(program, program, NULL); - fprintf(stderr, "execl(%s) failed: %d (%s)\n", - program, errno, strerror(errno)); + printf("execl(%s) failed: %d (%s)\n", + program, errno, strerror(errno)); exit(EXIT_FAILURE); } else { From 0448a96e243d7ae0e2db3a633d0f2307be3aa8b7 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 21 Oct 2024 12:07:13 +0530 Subject: [PATCH 103/168] arm64/mm: Drop _PROT_SECT_DEFAULT 'commit db95ea787bd1 ("arm64: mm: Wire up TCR.DS bit to PTE shareability 
fields")' dropped the last reference to symbol _PROT_SECT_DEFAULT, while transitioning from PMD_SECT_S to PMD_MAYBE_SHARED for PROT_SECT_DEFAULT. Hence let's just drop that symbol which is now unused. Cc: Will Deacon Cc: Ryan Roberts Cc: Ard Biesheuvel Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20241021063713.750870-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable-prot.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 2a11d0c10760..84a099989f82 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -35,7 +35,6 @@ #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ #define _PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) -#define _PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) #define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_MAYBE_NG | PTE_MAYBE_SHARED | PTE_AF) #define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_MAYBE_NG | PMD_MAYBE_SHARED | PMD_SECT_AF) From 0263a1e4f5ddbdac5ad84e84e5ac75dcb20121a1 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Tue, 24 Sep 2024 14:12:49 +0800 Subject: [PATCH 104/168] dt-bindings: perf: fsl-imx-ddr: Add i.MX91 compatible i.MX91 has a DDR Performance Monitor Unit which is compatible with i.MX93. This will add a compatible for i.MX91. Signed-off-by: Xu Yang Reviewed-by: Frank Li Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20240924061251.3387850-1-xu.yang_2@nxp.com Signed-off-by: Will Deacon --- Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml index 37e8b98f2cdc..8597ea625edb 100644 --- a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml +++ b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml @@ -31,7 +31,9 @@ properties: - const: fsl,imx8dxl-ddr-pmu - const: fsl,imx8-ddr-pmu - items: - - const: fsl,imx95-ddr-pmu + - enum: + - fsl,imx91-ddr-pmu + - fsl,imx95-ddr-pmu - const: fsl,imx93-ddr-pmu reg: From 44798fe136dc5adc9733a5c3d1fddafd6ec942ff Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Tue, 24 Sep 2024 14:12:50 +0800 Subject: [PATCH 105/168] perf: imx_perf: add support for i.MX91 platform This will add compatible and identifier for i.MX91 platform. 
Signed-off-by: Xu Yang Reviewed-by: Frank Li Link: https://lore.kernel.org/r/20240924061251.3387850-2-xu.yang_2@nxp.com Signed-off-by: Will Deacon --- drivers/perf/fsl_imx9_ddr_perf.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/perf/fsl_imx9_ddr_perf.c b/drivers/perf/fsl_imx9_ddr_perf.c index 69f920b1caf2..fe1a51f64751 100644 --- a/drivers/perf/fsl_imx9_ddr_perf.c +++ b/drivers/perf/fsl_imx9_ddr_perf.c @@ -81,6 +81,10 @@ struct ddr_pmu { int id; }; +static const struct imx_ddr_devtype_data imx91_devtype_data = { + .identifier = "imx91", +}; + static const struct imx_ddr_devtype_data imx93_devtype_data = { .identifier = "imx93", }; @@ -100,6 +104,7 @@ static inline bool is_imx95(struct ddr_pmu *pmu) } static const struct of_device_id imx_ddr_pmu_dt_ids[] = { + { .compatible = "fsl,imx91-ddr-pmu", .data = &imx91_devtype_data }, { .compatible = "fsl,imx93-ddr-pmu", .data = &imx93_devtype_data }, { .compatible = "fsl,imx95-ddr-pmu", .data = &imx95_devtype_data }, { /* sentinel */ } From 48545b3eff6b061ed616d3ed67ca215abfb1000d Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 9 Oct 2024 19:52:08 -0700 Subject: [PATCH 106/168] perf/cxlpmu: Support missing events in 3.1 spec Update the CXL PMU driver to support the new events introduced in the latest revision. These are: - read/write accesses with TEE constraints. - S2M indicating Modified state. Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Signed-off-by: Davidlohr Bueso Link: https://lore.kernel.org/r/20241010025208.180458-1-dave@stgolabs.net Signed-off-by: Will Deacon --- drivers/perf/cxl_pmu.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/perf/cxl_pmu.c b/drivers/perf/cxl_pmu.c index 43d68b69e630..bee4b5b52ec6 100644 --- a/drivers/perf/cxl_pmu.c +++ b/drivers/perf/cxl_pmu.c @@ -354,7 +354,7 @@ static struct attribute *cxl_pmu_event_attrs[] = { CXL_PMU_EVENT_CXL_ATTR(d2h_req_wowrinvf, CXL_PMU_GID_D2H_REQ, BIT(13)), CXL_PMU_EVENT_CXL_ATTR(d2h_req_wrinv, CXL_PMU_GID_D2H_REQ, BIT(14)), CXL_PMU_EVENT_CXL_ATTR(d2h_req_cacheflushed, CXL_PMU_GID_D2H_REQ, BIT(16)), - /* CXL rev 3.0 Table 3-20 - D2H Repsonse Encodings */ + /* CXL rev 3.0 Table 3-20 - D2H Response Encodings */ CXL_PMU_EVENT_CXL_ATTR(d2h_rsp_rspihiti, CXL_PMU_GID_D2H_RSP, BIT(4)), CXL_PMU_EVENT_CXL_ATTR(d2h_rsp_rspvhitv, CXL_PMU_GID_D2H_RSP, BIT(6)), CXL_PMU_EVENT_CXL_ATTR(d2h_rsp_rspihitse, CXL_PMU_GID_D2H_RSP, BIT(5)), @@ -377,12 +377,14 @@ static struct attribute *cxl_pmu_event_attrs[] = { /* CXL rev 3.0 Table 13-5 directly lists these */ CXL_PMU_EVENT_CXL_ATTR(cachedata_d2h_data, CXL_PMU_GID_CACHE_DATA, BIT(0)), CXL_PMU_EVENT_CXL_ATTR(cachedata_h2d_data, CXL_PMU_GID_CACHE_DATA, BIT(1)), - /* CXL rev 3.0 Table 3-29 M2S Req Memory Opcodes */ + /* CXL rev 3.1 Table 3-35 M2S Req Memory Opcodes */ CXL_PMU_EVENT_CXL_ATTR(m2s_req_meminv, CXL_PMU_GID_M2S_REQ, BIT(0)), CXL_PMU_EVENT_CXL_ATTR(m2s_req_memrd, CXL_PMU_GID_M2S_REQ, BIT(1)), CXL_PMU_EVENT_CXL_ATTR(m2s_req_memrddata, CXL_PMU_GID_M2S_REQ, BIT(2)), CXL_PMU_EVENT_CXL_ATTR(m2s_req_memrdfwd, CXL_PMU_GID_M2S_REQ, BIT(3)), CXL_PMU_EVENT_CXL_ATTR(m2s_req_memwrfwd, CXL_PMU_GID_M2S_REQ, BIT(4)), + CXL_PMU_EVENT_CXL_ATTR(m2s_req_memrdtee, CXL_PMU_GID_M2S_REQ, BIT(5)), + CXL_PMU_EVENT_CXL_ATTR(m2s_req_memrddatatee, CXL_PMU_GID_M2S_REQ, BIT(6)), CXL_PMU_EVENT_CXL_ATTR(m2s_req_memspecrd, CXL_PMU_GID_M2S_REQ, BIT(8)), CXL_PMU_EVENT_CXL_ATTR(m2s_req_meminvnt, CXL_PMU_GID_M2S_REQ, BIT(9)), CXL_PMU_EVENT_CXL_ATTR(m2s_req_memcleanevict, CXL_PMU_GID_M2S_REQ, BIT(10)), @@ 
-404,10 +406,11 @@ static struct attribute *cxl_pmu_event_attrs[] = { CXL_PMU_EVENT_CXL_ATTR(s2m_bisnp_curblk, CXL_PMU_GID_S2M_BISNP, BIT(4)), CXL_PMU_EVENT_CXL_ATTR(s2m_bisnp_datblk, CXL_PMU_GID_S2M_BISNP, BIT(5)), CXL_PMU_EVENT_CXL_ATTR(s2m_bisnp_invblk, CXL_PMU_GID_S2M_BISNP, BIT(6)), - /* CXL rev 3.0 Table 3-43 S2M NDR Opcopdes */ + /* CXL rev 3.1 Table 3-50 S2M NDR Opcopdes */ CXL_PMU_EVENT_CXL_ATTR(s2m_ndr_cmp, CXL_PMU_GID_S2M_NDR, BIT(0)), CXL_PMU_EVENT_CXL_ATTR(s2m_ndr_cmps, CXL_PMU_GID_S2M_NDR, BIT(1)), CXL_PMU_EVENT_CXL_ATTR(s2m_ndr_cmpe, CXL_PMU_GID_S2M_NDR, BIT(2)), + CXL_PMU_EVENT_CXL_ATTR(s2m_ndr_cmpm, CXL_PMU_GID_S2M_NDR, BIT(3)), CXL_PMU_EVENT_CXL_ATTR(s2m_ndr_biconflictack, CXL_PMU_GID_S2M_NDR, BIT(4)), /* CXL rev 3.0 Table 3-46 S2M DRS opcodes */ CXL_PMU_EVENT_CXL_ATTR(s2m_drs_memdata, CXL_PMU_GID_S2M_DRS, BIT(0)), From 759b5fc6cc3e3c5841f6a3e4638b39534b0fc716 Mon Sep 17 00:00:00 2001 From: Ilkka Koskinen Date: Wed, 16 Oct 2024 14:01:36 -0700 Subject: [PATCH 107/168] perf/dwc_pcie: Convert the events with mixed case to lowercase Group #1 events had both upper and lower case characters in their names. Trying to count such events with perf tool results in an error: $ perf stat -e dwc_rootport_10008/Tx_PCIe_TLP_Data_Payload/ sleep 1 event syntax error: 'dwc_rootport_10008/Tx_PCIe_TLP_Data_Payload/' \___ Bad event or PMU Unable to find PMU or event on a PMU of 'dwc_rootport_10008' event syntax error: '..port_10008/Tx_PCIe_TLP_Data_Payload/' \___ unknown term 'Tx_PCIe_TLP_Data_Payload' for pmu 'dwc_rootport_10008' valid terms: eventid,type,lane,config,config1,config2,config3,name,period,percore,metric-id Run 'perf list' for a list of valid events Usage: perf stat [] [] -e, --event event selector. use 'perf list' to list available events Perf tool assumes the event names are either in lower or upper case. This is also mentioned in Documentation/ABI/testing/sysfs-bus-event_source-devices-events "As performance monitoring event names are case insensitive in the perf tool, the perf tool only looks for lower or upper case event names in sysfs to avoid scanning the directory. It is therefore required the name of the event here is either lower or upper case." Change the Group #1 events names to lower case. 
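With the rename in place the same counter can be requested using the
lowercase spelling, for example (PMU instance name as in the failing
command above):

	$ perf stat -e dwc_rootport_10008/tx_pcie_tlp_data_payload/ sleep 1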
Signed-off-by: Ilkka Koskinen Link: https://lore.kernel.org/r/20241016210136.65452-1-ilkka@os.amperecomputing.com Signed-off-by: Will Deacon --- drivers/perf/dwc_pcie_pmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c index 59526a48499f..126d2c3516ad 100644 --- a/drivers/perf/dwc_pcie_pmu.c +++ b/drivers/perf/dwc_pcie_pmu.c @@ -202,10 +202,10 @@ static struct attribute *dwc_pcie_pmu_time_event_attrs[] = { DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1_AUX, 0x09), /* Group #1 */ - DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Tx_PCIe_TLP_Data_Payload, 0x20), - DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Rx_PCIe_TLP_Data_Payload, 0x21), - DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Tx_CCIX_TLP_Data_Payload, 0x22), - DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Rx_CCIX_TLP_Data_Payload, 0x23), + DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(tx_pcie_tlp_data_payload, 0x20), + DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(rx_pcie_tlp_data_payload, 0x21), + DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(tx_ccix_tlp_data_payload, 0x22), + DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(rx_ccix_tlp_data_payload, 0x23), /* * Leave it to the user to specify the lane ID to avoid generating From 0bbff9ed81654d5f06bfca484681756ee407f924 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 2 Oct 2024 13:43:24 -0500 Subject: [PATCH 108/168] perf/arm_pmuv3: Add PMUv3.9 per counter EL0 access control Armv8.9/9.4 PMUv3.9 adds per counter EL0 access controls. Per counter access is enabled with the UEN bit in PMUSERENR_EL1 register. Individual counters are enabled/disabled in the PMUACR_EL1 register. When UEN is set, the CR/ER bits control EL0 write access and must be set to disable write access. With the access controls, the clearing of unused counters can be skipped. KVM also configures PMUSERENR_EL1 in order to trap to EL2. UEN does not need to be set for it since only PMUv3.5 is exposed to guests. 
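As an illustrative sketch only (mirroring the driver hunk below and the
PMUACR_EL1 field layout added to the sysreg file, not extra code for
this patch), the new programming model boils down to:

	/* Fine-grained EL0 access; ER/CR set => EL0 writes stay disabled */
	write_pmuserenr(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR |
			ARMV8_PMU_USERENR_UEN);

	/* PMUACR_EL1: P[30:0] = event counters, C (bit 31) = cycle counter */
	write_pmuacr(BIT(0) | BIT(31));	/* user reads of counter 0 + cycles */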
Signed-off-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20241002184326.1105499-1-robh@kernel.org Signed-off-by: Will Deacon --- arch/arm/include/asm/arm_pmuv3.h | 6 ++++++ arch/arm64/include/asm/arm_pmuv3.h | 10 ++++++++++ arch/arm64/tools/sysreg | 8 ++++++++ drivers/perf/arm_pmuv3.c | 29 +++++++++++++++++++---------- include/linux/perf/arm_pmuv3.h | 1 + 5 files changed, 44 insertions(+), 10 deletions(-) diff --git a/arch/arm/include/asm/arm_pmuv3.h b/arch/arm/include/asm/arm_pmuv3.h index f63ba8986b24..d242b5e1ca0d 100644 --- a/arch/arm/include/asm/arm_pmuv3.h +++ b/arch/arm/include/asm/arm_pmuv3.h @@ -231,6 +231,7 @@ static inline void kvm_vcpu_pmu_resync_el0(void) {} #define ARMV8_PMU_DFR_VER_V3P1 0x4 #define ARMV8_PMU_DFR_VER_V3P4 0x5 #define ARMV8_PMU_DFR_VER_V3P5 0x6 +#define ARMV8_PMU_DFR_VER_V3P9 0x9 #define ARMV8_PMU_DFR_VER_IMP_DEF 0xF static inline bool pmuv3_implemented(int pmuver) @@ -249,6 +250,11 @@ static inline bool is_pmuv3p5(int pmuver) return pmuver >= ARMV8_PMU_DFR_VER_V3P5; } +static inline bool is_pmuv3p9(int pmuver) +{ + return pmuver >= ARMV8_PMU_DFR_VER_V3P9; +} + static inline u64 read_pmceid0(void) { u64 val = read_sysreg(PMCEID0); diff --git a/arch/arm64/include/asm/arm_pmuv3.h b/arch/arm64/include/asm/arm_pmuv3.h index 468a049bc63b..8a777dec8d88 100644 --- a/arch/arm64/include/asm/arm_pmuv3.h +++ b/arch/arm64/include/asm/arm_pmuv3.h @@ -152,6 +152,11 @@ static inline void write_pmuserenr(u32 val) write_sysreg(val, pmuserenr_el0); } +static inline void write_pmuacr(u64 val) +{ + write_sysreg_s(val, SYS_PMUACR_EL1); +} + static inline u64 read_pmceid0(void) { return read_sysreg(pmceid0_el0); @@ -178,4 +183,9 @@ static inline bool is_pmuv3p5(int pmuver) return pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P5; } +static inline bool is_pmuv3p9(int pmuver) +{ + return pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P9; +} + #endif diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 8d637ac4b7c6..74fb5af91d4f 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1238,6 +1238,7 @@ UnsignedEnum 11:8 PMUVer 0b0110 V3P5 0b0111 V3P7 0b1000 V3P8 + 0b1001 V3P9 0b1111 IMP_DEF EndEnum UnsignedEnum 7:4 TraceVer @@ -2178,6 +2179,13 @@ Field 4 P Field 3:0 ALIGN EndSysreg +Sysreg PMUACR_EL1 3 0 9 14 4 +Res0 63:33 +Field 32 F0 +Field 31 C +Field 30:0 P +EndSysreg + Sysreg PMSELR_EL0 3 3 9 12 5 Res0 63:5 Field 4:0 SEL diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 0afe02f879b4..bb93d32b86ea 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -770,18 +770,27 @@ static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu) int i; struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events); - /* Clear any unused counters to avoid leaking their contents */ - for_each_andnot_bit(i, cpu_pmu->cntr_mask, cpuc->used_mask, - ARMPMU_MAX_HWEVENTS) { - if (i == ARMV8_PMU_CYCLE_IDX) - write_pmccntr(0); - else if (i == ARMV8_PMU_INSTR_IDX) - write_pmicntr(0); - else - armv8pmu_write_evcntr(i, 0); + if (is_pmuv3p9(cpu_pmu->pmuver)) { + u64 mask = 0; + for_each_set_bit(i, cpuc->used_mask, ARMPMU_MAX_HWEVENTS) { + if (armv8pmu_event_has_user_read(cpuc->events[i])) + mask |= BIT(i); + } + write_pmuacr(mask); + } else { + /* Clear any unused counters to avoid leaking their contents */ + for_each_andnot_bit(i, cpu_pmu->cntr_mask, cpuc->used_mask, + ARMPMU_MAX_HWEVENTS) { + if (i == ARMV8_PMU_CYCLE_IDX) + write_pmccntr(0); + else if (i == ARMV8_PMU_INSTR_IDX) + write_pmicntr(0); + else + armv8pmu_write_evcntr(i, 0); + } } - 
update_pmuserenr(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR); + update_pmuserenr(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_UEN); } static void armv8pmu_enable_event(struct perf_event *event) diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index 3372c1b56486..d698efba28a2 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -257,6 +257,7 @@ #define ARMV8_PMU_USERENR_SW (1 << 1) /* PMSWINC can be written at EL0 */ #define ARMV8_PMU_USERENR_CR (1 << 2) /* Cycle counter can be read at EL0 */ #define ARMV8_PMU_USERENR_ER (1 << 3) /* Event counter can be read at EL0 */ +#define ARMV8_PMU_USERENR_UEN (1 << 4) /* Fine grained per counter access at EL0 */ /* Mask for writable bits */ #define ARMV8_PMU_USERENR_MASK (ARMV8_PMU_USERENR_EN | ARMV8_PMU_USERENR_SW | \ ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_ER) From e1dce56443a4a18978fe39ee4af663e5b6b31422 Mon Sep 17 00:00:00 2001 From: Gowthami Thiagarajan Date: Mon, 28 Oct 2024 11:23:09 +0530 Subject: [PATCH 109/168] perf/marvell: Marvell PEM performance monitor support PCI Express Interface PMU includes various performance counters to monitor the data that is transmitted over the PCIe link. The counters track various inbound and outbound transactions which includes separate counters for posted/non-posted/completion TLPs. Also, inbound and outbound memory read requests along with their latencies can also be monitored. Address Translation Services(ATS)events such as ATS Translation, ATS Page Request, ATS Invalidation along with their corresponding latencies are also supported. The performance counters are 64 bits wide. For instance, perf stat -e ib_tlp_pr tracks the inbound posted TLPs for the workload. Co-developed-by: Linu Cherian Signed-off-by: Linu Cherian Signed-off-by: Gowthami Thiagarajan Link: https://lore.kernel.org/r/20241028055309.17893-1-gthiagarajan@marvell.com Signed-off-by: Will Deacon --- Documentation/admin-guide/perf/index.rst | 1 + .../admin-guide/perf/mrvl-pem-pmu.rst | 56 +++ MAINTAINERS | 6 + drivers/perf/Kconfig | 7 + drivers/perf/Makefile | 1 + drivers/perf/marvell_pem_pmu.c | 425 ++++++++++++++++++ include/linux/cpuhotplug.h | 1 + 7 files changed, 497 insertions(+) create mode 100644 Documentation/admin-guide/perf/mrvl-pem-pmu.rst create mode 100644 drivers/perf/marvell_pem_pmu.c diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst index 8502bc174640..a58bd3f7e190 100644 --- a/Documentation/admin-guide/perf/index.rst +++ b/Documentation/admin-guide/perf/index.rst @@ -26,3 +26,4 @@ Performance monitor support meson-ddr-pmu cxl ampere_cspmu + mrvl-pem-pmu diff --git a/Documentation/admin-guide/perf/mrvl-pem-pmu.rst b/Documentation/admin-guide/perf/mrvl-pem-pmu.rst new file mode 100644 index 000000000000..c39007149b97 --- /dev/null +++ b/Documentation/admin-guide/perf/mrvl-pem-pmu.rst @@ -0,0 +1,56 @@ +================================================================= +Marvell Odyssey PEM Performance Monitoring Unit (PMU UNCORE) +================================================================= + +The PCI Express Interface Units(PEM) are associated with a corresponding +monitoring unit. This includes performance counters to track various +characteristics of the data that is transmitted over the PCIe link. + +The counters track inbound and outbound transactions which +includes separate counters for posted/non-posted/completion TLPs. 
+Also, inbound and outbound memory read requests along with their +latencies can also be monitored. Address Translation Services(ATS)events +such as ATS Translation, ATS Page Request, ATS Invalidation along with +their corresponding latencies are also tracked. + +There are separate 64 bit counters to measure posted/non-posted/completion +tlps in inbound and outbound transactions. ATS events are measured by +different counters. + +The PMU driver exposes the available events and format options under sysfs, +/sys/bus/event_source/devices/mrvl_pcie_rc_pmu_<>/events/ +/sys/bus/event_source/devices/mrvl_pcie_rc_pmu_<>/format/ + +Examples:: + + # perf list | grep mrvl_pcie_rc_pmu + mrvl_pcie_rc_pmu_<>/ats_inv/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ats_inv_latency/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ats_pri/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ats_pri_latency/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ats_trans/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ats_trans_latency/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ib_inflight/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ib_reads/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ib_req_no_ro_ebus/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ib_req_no_ro_ncb/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ib_tlp_cpl_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ib_tlp_dwords_cpl_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ib_tlp_dwords_npr/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ib_tlp_dwords_pr/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ib_tlp_npr/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ib_tlp_pr/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ob_inflight_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ob_merges_cpl_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ob_merges_npr_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ob_merges_pr_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ob_reads_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ob_tlp_cpl_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ob_tlp_dwords_cpl_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ob_tlp_dwords_npr_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ob_tlp_dwords_pr_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ob_tlp_npr_partid/ [Kernel PMU event] + mrvl_pcie_rc_pmu_<>/ob_tlp_pr_partid/ [Kernel PMU event] + + + # perf stat -e ib_inflight,ib_reads,ib_req_no_ro_ebus,ib_req_no_ro_ncb diff --git a/MAINTAINERS b/MAINTAINERS index 7ad507f49324..aaf2cfbf1d7c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13845,6 +13845,12 @@ S: Supported F: Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst F: drivers/net/ethernet/marvell/octeontx2/af/ +MARVELL PEM PMU DRIVER +M: Linu Cherian +M: Gowthami Thiagarajan +S: Supported +F: drivers/perf/marvell_pem_pmu.c + MARVELL PRESTERA ETHERNET SWITCH DRIVER M: Taras Chornyi S: Supported diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index bab8ba64162f..4e268de351c4 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -284,4 +284,11 @@ config CXL_PMU If unsure say 'm'. +config MARVELL_PEM_PMU + tristate "MARVELL PEM PMU Support" + depends on ARCH_THUNDER || (COMPILE_TEST && 64BIT) + help + Enable support for PCIe Interface performance monitoring + on Marvell platform. 
+ endmenu diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index 8268f38e42c5..de71d2574857 100644 --- a/drivers/perf/Makefile +++ b/drivers/perf/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_ARM_SPE_PMU) += arm_spe_pmu.o obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o obj-$(CONFIG_MARVELL_CN10K_DDR_PMU) += marvell_cn10k_ddr_pmu.o +obj-$(CONFIG_MARVELL_PEM_PMU) += marvell_pem_pmu.o obj-$(CONFIG_APPLE_M1_CPU_PMU) += apple_m1_cpu_pmu.o obj-$(CONFIG_ALIBABA_UNCORE_DRW_PMU) += alibaba_uncore_drw_pmu.o obj-$(CONFIG_DWC_PCIE_PMU) += dwc_pcie_pmu.o diff --git a/drivers/perf/marvell_pem_pmu.c b/drivers/perf/marvell_pem_pmu.c new file mode 100644 index 000000000000..29fbcd1848e4 --- /dev/null +++ b/drivers/perf/marvell_pem_pmu.c @@ -0,0 +1,425 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Marvell PEM(PCIe RC) Performance Monitor Driver + * + * Copyright (C) 2024 Marvell. + */ + +#include +#include +#include +#include +#include +#include + +/* + * Each of these events maps to a free running 64 bit counter + * with no event control, but can be reset. + */ +enum pem_events { + IB_TLP_NPR, + IB_TLP_PR, + IB_TLP_CPL, + IB_TLP_DWORDS_NPR, + IB_TLP_DWORDS_PR, + IB_TLP_DWORDS_CPL, + IB_INFLIGHT, + IB_READS, + IB_REQ_NO_RO_NCB, + IB_REQ_NO_RO_EBUS, + OB_TLP_NPR, + OB_TLP_PR, + OB_TLP_CPL, + OB_TLP_DWORDS_NPR, + OB_TLP_DWORDS_PR, + OB_TLP_DWORDS_CPL, + OB_INFLIGHT, + OB_READS, + OB_MERGES_NPR, + OB_MERGES_PR, + OB_MERGES_CPL, + ATS_TRANS, + ATS_TRANS_LATENCY, + ATS_PRI, + ATS_PRI_LATENCY, + ATS_INV, + ATS_INV_LATENCY, + PEM_EVENTIDS_MAX +}; + +static u64 eventid_to_offset_table[] = { + [IB_TLP_NPR] = 0x0, + [IB_TLP_PR] = 0x8, + [IB_TLP_CPL] = 0x10, + [IB_TLP_DWORDS_NPR] = 0x100, + [IB_TLP_DWORDS_PR] = 0x108, + [IB_TLP_DWORDS_CPL] = 0x110, + [IB_INFLIGHT] = 0x200, + [IB_READS] = 0x300, + [IB_REQ_NO_RO_NCB] = 0x400, + [IB_REQ_NO_RO_EBUS] = 0x408, + [OB_TLP_NPR] = 0x500, + [OB_TLP_PR] = 0x508, + [OB_TLP_CPL] = 0x510, + [OB_TLP_DWORDS_NPR] = 0x600, + [OB_TLP_DWORDS_PR] = 0x608, + [OB_TLP_DWORDS_CPL] = 0x610, + [OB_INFLIGHT] = 0x700, + [OB_READS] = 0x800, + [OB_MERGES_NPR] = 0x900, + [OB_MERGES_PR] = 0x908, + [OB_MERGES_CPL] = 0x910, + [ATS_TRANS] = 0x2D18, + [ATS_TRANS_LATENCY] = 0x2D20, + [ATS_PRI] = 0x2D28, + [ATS_PRI_LATENCY] = 0x2D30, + [ATS_INV] = 0x2D38, + [ATS_INV_LATENCY] = 0x2D40, +}; + +struct pem_pmu { + struct pmu pmu; + void __iomem *base; + unsigned int cpu; + struct device *dev; + struct hlist_node node; +}; + +#define to_pem_pmu(p) container_of(p, struct pem_pmu, pmu) + +static int eventid_to_offset(int eventid) +{ + return eventid_to_offset_table[eventid]; +} + +/* Events */ +static ssize_t pem_pmu_event_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr; + + pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); + return sysfs_emit(page, "event=0x%02llx\n", pmu_attr->id); +} + +#define PEM_EVENT_ATTR(_name, _id) \ + (&((struct perf_pmu_events_attr[]) { \ + { .attr = __ATTR(_name, 0444, pem_pmu_event_show, NULL), \ + .id = _id, } \ + })[0].attr.attr) + +static struct attribute *pem_perf_events_attrs[] = { + PEM_EVENT_ATTR(ib_tlp_npr, IB_TLP_NPR), + PEM_EVENT_ATTR(ib_tlp_pr, IB_TLP_PR), + PEM_EVENT_ATTR(ib_tlp_cpl_partid, IB_TLP_CPL), + PEM_EVENT_ATTR(ib_tlp_dwords_npr, IB_TLP_DWORDS_NPR), + PEM_EVENT_ATTR(ib_tlp_dwords_pr, IB_TLP_DWORDS_PR), + PEM_EVENT_ATTR(ib_tlp_dwords_cpl_partid, IB_TLP_DWORDS_CPL), + PEM_EVENT_ATTR(ib_inflight, IB_INFLIGHT), + 
PEM_EVENT_ATTR(ib_reads, IB_READS), + PEM_EVENT_ATTR(ib_req_no_ro_ncb, IB_REQ_NO_RO_NCB), + PEM_EVENT_ATTR(ib_req_no_ro_ebus, IB_REQ_NO_RO_EBUS), + PEM_EVENT_ATTR(ob_tlp_npr_partid, OB_TLP_NPR), + PEM_EVENT_ATTR(ob_tlp_pr_partid, OB_TLP_PR), + PEM_EVENT_ATTR(ob_tlp_cpl_partid, OB_TLP_CPL), + PEM_EVENT_ATTR(ob_tlp_dwords_npr_partid, OB_TLP_DWORDS_NPR), + PEM_EVENT_ATTR(ob_tlp_dwords_pr_partid, OB_TLP_DWORDS_PR), + PEM_EVENT_ATTR(ob_tlp_dwords_cpl_partid, OB_TLP_DWORDS_CPL), + PEM_EVENT_ATTR(ob_inflight_partid, OB_INFLIGHT), + PEM_EVENT_ATTR(ob_reads_partid, OB_READS), + PEM_EVENT_ATTR(ob_merges_npr_partid, OB_MERGES_NPR), + PEM_EVENT_ATTR(ob_merges_pr_partid, OB_MERGES_PR), + PEM_EVENT_ATTR(ob_merges_cpl_partid, OB_MERGES_CPL), + PEM_EVENT_ATTR(ats_trans, ATS_TRANS), + PEM_EVENT_ATTR(ats_trans_latency, ATS_TRANS_LATENCY), + PEM_EVENT_ATTR(ats_pri, ATS_PRI), + PEM_EVENT_ATTR(ats_pri_latency, ATS_PRI_LATENCY), + PEM_EVENT_ATTR(ats_inv, ATS_INV), + PEM_EVENT_ATTR(ats_inv_latency, ATS_INV_LATENCY), + NULL +}; + +static struct attribute_group pem_perf_events_attr_group = { + .name = "events", + .attrs = pem_perf_events_attrs, +}; + +PMU_FORMAT_ATTR(event, "config:0-5"); + +static struct attribute *pem_perf_format_attrs[] = { + &format_attr_event.attr, + NULL +}; + +static struct attribute_group pem_perf_format_attr_group = { + .name = "format", + .attrs = pem_perf_format_attrs, +}; + +/* cpumask */ +static ssize_t pem_perf_cpumask_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pem_pmu *pmu = dev_get_drvdata(dev); + + return cpumap_print_to_pagebuf(true, buf, cpumask_of(pmu->cpu)); +} + +static struct device_attribute pem_perf_cpumask_attr = + __ATTR(cpumask, 0444, pem_perf_cpumask_show, NULL); + +static struct attribute *pem_perf_cpumask_attrs[] = { + &pem_perf_cpumask_attr.attr, + NULL +}; + +static struct attribute_group pem_perf_cpumask_attr_group = { + .attrs = pem_perf_cpumask_attrs, +}; + +static const struct attribute_group *pem_perf_attr_groups[] = { + &pem_perf_events_attr_group, + &pem_perf_cpumask_attr_group, + &pem_perf_format_attr_group, + NULL +}; + +static int pem_perf_event_init(struct perf_event *event) +{ + struct pem_pmu *pmu = to_pem_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + struct perf_event *sibling; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + if (event->attr.config >= PEM_EVENTIDS_MAX) + return -EINVAL; + + if (is_sampling_event(event) || + event->attach_state & PERF_ATTACH_TASK) { + return -EOPNOTSUPP; + } + + if (event->cpu < 0) + return -EOPNOTSUPP; + + /* We must NOT create groups containing mixed PMUs */ + if (event->group_leader->pmu != event->pmu && + !is_software_event(event->group_leader)) + return -EINVAL; + + for_each_sibling_event(sibling, event->group_leader) { + if (sibling->pmu != event->pmu && + !is_software_event(sibling)) + return -EINVAL; + } + /* + * Set ownership of event to one CPU, same event can not be observed + * on multiple cpus at same time. 
+ */ + event->cpu = pmu->cpu; + hwc->idx = -1; + return 0; +} + +static u64 pem_perf_read_counter(struct pem_pmu *pmu, + struct perf_event *event, int eventid) +{ + return readq_relaxed(pmu->base + eventid_to_offset(eventid)); +} + +static void pem_perf_event_update(struct perf_event *event) +{ + struct pem_pmu *pmu = to_pem_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + u64 prev_count, new_count; + + do { + prev_count = local64_read(&hwc->prev_count); + new_count = pem_perf_read_counter(pmu, event, hwc->idx); + } while (local64_xchg(&hwc->prev_count, new_count) != prev_count); + + local64_add((new_count - prev_count), &event->count); +} + +static void pem_perf_event_start(struct perf_event *event, int flags) +{ + struct pem_pmu *pmu = to_pem_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int eventid = hwc->idx; + + /* + * All counters are free-running and associated with + * a fixed event to track in Hardware + */ + local64_set(&hwc->prev_count, + pem_perf_read_counter(pmu, event, eventid)); + + hwc->state = 0; +} + +static int pem_perf_event_add(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = event->attr.config; + if (WARN_ON_ONCE(hwc->idx >= PEM_EVENTIDS_MAX)) + return -EINVAL; + hwc->state |= PERF_HES_STOPPED; + + if (flags & PERF_EF_START) + pem_perf_event_start(event, flags); + + return 0; +} + +static void pem_perf_event_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + if (flags & PERF_EF_UPDATE) + pem_perf_event_update(event); + + hwc->state |= PERF_HES_STOPPED; +} + +static void pem_perf_event_del(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + pem_perf_event_stop(event, PERF_EF_UPDATE); + hwc->idx = -1; +} + +static int pem_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) +{ + struct pem_pmu *pmu = hlist_entry_safe(node, struct pem_pmu, node); + unsigned int target; + + if (cpu != pmu->cpu) + return 0; + + target = cpumask_any_but(cpu_online_mask, cpu); + if (target >= nr_cpu_ids) + return 0; + + perf_pmu_migrate_context(&pmu->pmu, cpu, target); + pmu->cpu = target; + return 0; +} + +static int pem_perf_probe(struct platform_device *pdev) +{ + struct pem_pmu *pem_pmu; + struct resource *res; + void __iomem *base; + char *name; + int ret; + + pem_pmu = devm_kzalloc(&pdev->dev, sizeof(*pem_pmu), GFP_KERNEL); + if (!pem_pmu) + return -ENOMEM; + + pem_pmu->dev = &pdev->dev; + platform_set_drvdata(pdev, pem_pmu); + + base = devm_platform_get_and_ioremap_resource(pdev, 0, &res); + if (IS_ERR(base)) + return PTR_ERR(base); + + pem_pmu->base = base; + + pem_pmu->pmu = (struct pmu) { + .module = THIS_MODULE, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .task_ctx_nr = perf_invalid_context, + .attr_groups = pem_perf_attr_groups, + .event_init = pem_perf_event_init, + .add = pem_perf_event_add, + .del = pem_perf_event_del, + .start = pem_perf_event_start, + .stop = pem_perf_event_stop, + .read = pem_perf_event_update, + }; + + /* Choose this cpu to collect perf data */ + pem_pmu->cpu = raw_smp_processor_id(); + + name = devm_kasprintf(pem_pmu->dev, GFP_KERNEL, "mrvl_pcie_rc_pmu_%llx", + res->start); + if (!name) + return -ENOMEM; + + cpuhp_state_add_instance_nocalls(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE, + &pem_pmu->node); + + ret = perf_pmu_register(&pem_pmu->pmu, name, -1); + if (ret) + goto error; + + return 0; +error: + cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE, + &pem_pmu->node); + return ret; +} + 
+static void pem_perf_remove(struct platform_device *pdev) +{ + struct pem_pmu *pem_pmu = platform_get_drvdata(pdev); + + cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE, + &pem_pmu->node); + + perf_pmu_unregister(&pem_pmu->pmu); +} + +#ifdef CONFIG_ACPI +static const struct acpi_device_id pem_pmu_acpi_match[] = { + {"MRVL000E", 0}, + {} +}; +MODULE_DEVICE_TABLE(acpi, pem_pmu_acpi_match); +#endif + +static struct platform_driver pem_pmu_driver = { + .driver = { + .name = "pem-pmu", + .acpi_match_table = ACPI_PTR(pem_pmu_acpi_match), + .suppress_bind_attrs = true, + }, + .probe = pem_perf_probe, + .remove = pem_perf_remove, +}; + +static int __init pem_pmu_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE, + "perf/marvell/pem:online", NULL, + pem_pmu_offline_cpu); + if (ret) + return ret; + + ret = platform_driver_register(&pem_pmu_driver); + if (ret) + cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE); + return ret; +} + +static void __exit pem_pmu_exit(void) +{ + platform_driver_unregister(&pem_pmu_driver); + cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE); +} + +module_init(pem_pmu_init); +module_exit(pem_pmu_exit); + +MODULE_DESCRIPTION("Marvell PEM Perf driver"); +MODULE_AUTHOR("Gowthami Thiagarajan "); +MODULE_LICENSE("GPL"); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 2361ed4d2b15..61d9a66d1807 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -227,6 +227,7 @@ enum cpuhp_state { CPUHP_AP_PERF_ARM_APM_XGENE_ONLINE, CPUHP_AP_PERF_ARM_CAVIUM_TX2_UNCORE_ONLINE, CPUHP_AP_PERF_ARM_MARVELL_CN10K_DDR_ONLINE, + CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE, CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE, From bdc9a64c8b2041edfbcf02d07ee7402df65918e9 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Tue, 29 Oct 2024 07:06:02 -0500 Subject: [PATCH 110/168] ARM: pmuv3: Add missing write_pmuacr() Fix compilation on Arm by adding missing static inline write_pmuacr() declaration. Fixes: 0bbff9ed8165 ("perf/arm_pmuv3: Add PMUv3.9 per counter EL0 access control") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202410291954.NiHLIwSC-lkp@intel.com/ Signed-off-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20241029120602.4061566-2-robh@kernel.org Signed-off-by: Will Deacon --- arch/arm/include/asm/arm_pmuv3.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/include/asm/arm_pmuv3.h b/arch/arm/include/asm/arm_pmuv3.h index d242b5e1ca0d..2ec0e5e83fc9 100644 --- a/arch/arm/include/asm/arm_pmuv3.h +++ b/arch/arm/include/asm/arm_pmuv3.h @@ -212,6 +212,8 @@ static inline void write_pmuserenr(u32 val) write_sysreg(val, PMUSERENR); } +static inline void write_pmuacr(u64 val) {} + static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {} static inline void kvm_clr_pmu_events(u32 clr) {} static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr) From 83d511c3ca0cb70a55d8b0ae3e753448fb00272b Mon Sep 17 00:00:00 2001 From: Ilkka Koskinen Date: Tue, 8 Oct 2024 23:18:22 +0000 Subject: [PATCH 111/168] perf/dwc_pcie: Add support for Ampere SoCs Add support for Ampere SoCs by adding Ampere's vendor ID to the vendor list. 
Signed-off-by: Ilkka Koskinen Link: https://lore.kernel.org/r/20241008231824.5102-2-ilkka@os.amperecomputing.com Signed-off-by: Will Deacon --- drivers/perf/dwc_pcie_pmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c index 126d2c3516ad..edb5a809c928 100644 --- a/drivers/perf/dwc_pcie_pmu.c +++ b/drivers/perf/dwc_pcie_pmu.c @@ -106,6 +106,7 @@ struct dwc_pcie_vendor_id { static const struct dwc_pcie_vendor_id dwc_pcie_vendor_ids[] = { {.vendor_id = PCI_VENDOR_ID_ALIBABA }, + {.vendor_id = PCI_VENDOR_ID_AMPERE }, {.vendor_id = PCI_VENDOR_ID_QCOM }, {} /* terminator */ }; From 94b3ad10c2e1d0e761756a844b78a21101dd1810 Mon Sep 17 00:00:00 2001 From: Ilkka Koskinen Date: Tue, 8 Oct 2024 23:18:24 +0000 Subject: [PATCH 112/168] perf/dwc_pcie: Fix typos in event names Fix a few typos in event names Signed-off-by: Ilkka Koskinen Reviewed-by: Jing Zhang Reviewed-by: Shuai Xue Link: https://lore.kernel.org/r/20241008231824.5102-4-ilkka@os.amperecomputing.com Signed-off-by: Will Deacon --- drivers/perf/dwc_pcie_pmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c index edb5a809c928..9cbea9675e21 100644 --- a/drivers/perf/dwc_pcie_pmu.c +++ b/drivers/perf/dwc_pcie_pmu.c @@ -216,9 +216,9 @@ static struct attribute *dwc_pcie_pmu_time_event_attrs[] = { DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_update_fc_dllp, 0x601), DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_ack_dllp, 0x602), DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_update_fc_dllp, 0x603), - DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_nulified_tlp, 0x604), - DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_nulified_tlp, 0x605), - DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_duplicate_tl, 0x606), + DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_nullified_tlp, 0x604), + DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_nullified_tlp, 0x605), + DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_duplicate_tlp, 0x606), DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_memory_write, 0x700), DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_memory_read, 0x701), DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_configuration_write, 0x702), From 3930c88ad0a5a8538ef8a281d4f4c230fb4d4fc4 Mon Sep 17 00:00:00 2001 From: Markuss Broks Date: Sat, 26 Oct 2024 23:28:08 +0300 Subject: [PATCH 113/168] dt-bindings: arm: pmu: Add Samsung Mongoose core compatible Add the compatible for the Samsung Mongoose CPU PMU to the schema. Co-developed-by: Maksym Holovach Signed-off-by: Maksym Holovach Signed-off-by: Markuss Broks Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20241026-mongoose-pmu-v1-1-f1a7448054be@gmail.com Signed-off-by: Will Deacon --- Documentation/devicetree/bindings/arm/pmu.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/arm/pmu.yaml b/Documentation/devicetree/bindings/arm/pmu.yaml index 528544d0a161..a148ff54f2b8 100644 --- a/Documentation/devicetree/bindings/arm/pmu.yaml +++ b/Documentation/devicetree/bindings/arm/pmu.yaml @@ -74,6 +74,7 @@ properties: - qcom,krait-pmu - qcom,scorpion-pmu - qcom,scorpion-mp-pmu + - samsung,mongoose-pmu interrupts: # Don't know how many CPUs, so no constraints to specify From 9643aaa194737b1ad73f9da6e51fc63d1c7a864a Mon Sep 17 00:00:00 2001 From: Markuss Broks Date: Sat, 26 Oct 2024 23:28:09 +0300 Subject: [PATCH 114/168] perf: arm_pmuv3: Add support for Samsung Mongoose PMU Add support for the Samsung Mongoose CPU core PMU. This just adds the names and links to DT compatible strings. 
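With the PMUV3_INIT_SIMPLE() entry below, the core should register
under the usual PMUv3 naming scheme, so (sysfs path assumed) something
like:

	# ls /sys/bus/event_source/devices/ | grep mongoose
	armv8_samsung_mongoose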
Co-developed-by: Maksym Holovach Signed-off-by: Maksym Holovach Signed-off-by: Markuss Broks Link: https://lore.kernel.org/r/20241026-mongoose-pmu-v1-2-f1a7448054be@gmail.com Signed-off-by: Will Deacon --- drivers/perf/arm_pmuv3.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index bb93d32b86ea..b5cc11abc962 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -1373,6 +1373,8 @@ PMUV3_INIT_SIMPLE(armv8_neoverse_v3ae) PMUV3_INIT_SIMPLE(armv8_nvidia_carmel) PMUV3_INIT_SIMPLE(armv8_nvidia_denver) +PMUV3_INIT_SIMPLE(armv8_samsung_mongoose) + PMUV3_INIT_MAP_EVENT(armv8_cortex_a35, armv8_a53_map_event) PMUV3_INIT_MAP_EVENT(armv8_cortex_a53, armv8_a53_map_event) PMUV3_INIT_MAP_EVENT(armv8_cortex_a57, armv8_a57_map_event) @@ -1418,6 +1420,7 @@ static const struct of_device_id armv8_pmu_of_device_ids[] = { {.compatible = "brcm,vulcan-pmu", .data = armv8_brcm_vulcan_pmu_init}, {.compatible = "nvidia,carmel-pmu", .data = armv8_nvidia_carmel_pmu_init}, {.compatible = "nvidia,denver-pmu", .data = armv8_nvidia_denver_pmu_init}, + {.compatible = "samsung,mongoose-pmu", .data = armv8_samsung_mongoose_pmu_init}, {}, }; From 2cfdb799dc7681a93844e5019f9bbff603c2c9ee Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Mon, 28 Oct 2024 18:57:21 +0000 Subject: [PATCH 115/168] arm64: mops: Document requirements for hypervisors Add a mops.rst document to clarify in more detail what hypervisors need to do to run a Linux guest on a system with FEAT_MOPS. Signed-off-by: Kristina Martsenko Link: https://lore.kernel.org/r/20241028185721.52852-1-kristina.martsenko@arm.com Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/booting.rst | 4 +-- Documentation/arch/arm64/index.rst | 1 + Documentation/arch/arm64/mops.rst | 44 ++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 2 deletions(-) create mode 100644 Documentation/arch/arm64/mops.rst diff --git a/Documentation/arch/arm64/booting.rst b/Documentation/arch/arm64/booting.rst index db46af5b9f0f..dabd279dee5d 100644 --- a/Documentation/arch/arm64/booting.rst +++ b/Documentation/arch/arm64/booting.rst @@ -385,8 +385,8 @@ Before jumping into the kernel, the following conditions must be met: - HCRX_EL2.MSCEn (bit 11) must be initialised to 0b1. - - HCRX_EL2.MCE2 (bit 10) must be initialised to 0b1. The exception - handler must set PSTATE.SS to 0b0. + - HCRX_EL2.MCE2 (bit 10) must be initialised to 0b1 and the hypervisor + must handle MOPS exceptions as described in :ref:`arm64_mops_hyp`. For CPUs with the Extended Translation Control Register feature (FEAT_TCR2): diff --git a/Documentation/arch/arm64/index.rst b/Documentation/arch/arm64/index.rst index 78544de0a8a9..463de5855e84 100644 --- a/Documentation/arch/arm64/index.rst +++ b/Documentation/arch/arm64/index.rst @@ -20,6 +20,7 @@ ARM64 Architecture legacy_instructions memory memory-tagging-extension + mops perf pointer-authentication ptdump diff --git a/Documentation/arch/arm64/mops.rst b/Documentation/arch/arm64/mops.rst new file mode 100644 index 000000000000..2ef5b147f8dc --- /dev/null +++ b/Documentation/arch/arm64/mops.rst @@ -0,0 +1,44 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================================== +Memory copy/set instructions (MOPS) +=================================== + +A MOPS memory copy/set operation consists of three consecutive CPY* or SET* +instructions: a prologue, main and epilogue (for example: CPYP, CPYM, CPYE). 
+ +A main or epilogue instruction can take a MOPS exception for various reasons, +for example when a task is migrated to a CPU with a different MOPS +implementation, or when the instruction's alignment and size requirements are +not met. The software exception handler is then expected to reset the registers +and restart execution from the prologue instruction. Normally this is handled +by the kernel. + +For more details refer to "D1.3.5.7 Memory Copy and Memory Set exceptions" in +the Arm Architecture Reference Manual DDI 0487K.a (Arm ARM). + +.. _arm64_mops_hyp: + +Hypervisor requirements +----------------------- + +A hypervisor running a Linux guest must handle all MOPS exceptions from the +guest kernel, as Linux may not be able to handle the exception at all times. +For example, a MOPS exception can be taken when the hypervisor migrates a vCPU +to another physical CPU with a different MOPS implementation. + +To do this, the hypervisor must: + + - Set HCRX_EL2.MCE2 to 1 so that the exception is taken to the hypervisor. + + - Have an exception handler that implements the algorithm from the Arm ARM + rules CNTMJ and MWFQH. + + - Set the guest's PSTATE.SS to 0 in the exception handler, to handle a + potential step of the current instruction. + + Note: Clearing PSTATE.SS is needed so that a single step exception is taken + on the next instruction (the prologue instruction). Otherwise prologue + would get silently stepped over and the single step exception taken on the + main instruction. Note that if the guest instruction is not being stepped + then clearing PSTATE.SS has no effect. From f8192813dcbe6d27e7031d9d353ea1fcc67052fd Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 29 Oct 2024 10:15:29 +0530 Subject: [PATCH 116/168] arm64/mm: Re-organize arch_make_huge_pte() Core HugeTLB defines a fallback definition for arch_make_huge_pte(), which calls platform provided pte_mkhuge(). But if any platform already provides an override for arch_make_huge_pte(), then it does not need to provide the helper pte_mkhuge(). arm64 override for arch_make_huge_pte() calls pte_mkhuge() internally, thus creating an impression, that both of these callbacks are being used in core HugeTLB and hence required to be defined. This drops off pte_mkhuge() which was never required to begin with as there could not be any section mappings at the PTE level. Re-organize arch_make_huge_pte() based on requested page size and create the entry for the applicable page table level as needed. It also removes a redundancy of clearing PTE_TABLE_BIT bit followed by setting both PTE_TABLE_BIT and PTE_VALID bits (via PTE_TYPE_MASK) in the pte, while creating CONT_PTE_SIZE size entries. Cc: Will Deacon Cc: Ard Biesheuvel Cc: Ryan Roberts Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20241029044529.2624785-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 5 ----- arch/arm64/mm/hugetlbpage.c | 21 ++++++++++++++++----- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d56dbe5742d6..609fd4447b19 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -438,11 +438,6 @@ static inline void __set_ptes(struct mm_struct *mm, } } -/* - * Huge pte definitions. - */ -#define pte_mkhuge(pte) (__pte(pte_val(pte) & ~PTE_TABLE_BIT)) - /* * Hugetlb definitions. 
*/ diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 5f1e2103888b..3215adf48a1b 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -361,14 +361,25 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { size_t pagesize = 1UL << shift; - entry = pte_mkhuge(entry); - if (pagesize == CONT_PTE_SIZE) { - entry = pte_mkcont(entry); - } else if (pagesize == CONT_PMD_SIZE) { + switch (pagesize) { +#ifndef __PAGETABLE_PMD_FOLDED + case PUD_SIZE: + entry = pud_pte(pud_mkhuge(pte_pud(entry))); + break; +#endif + case CONT_PMD_SIZE: entry = pmd_pte(pmd_mkcont(pte_pmd(entry))); - } else if (pagesize != PUD_SIZE && pagesize != PMD_SIZE) { + fallthrough; + case PMD_SIZE: + entry = pmd_pte(pmd_mkhuge(pte_pmd(entry))); + break; + case CONT_PTE_SIZE: + entry = pte_mkcont(entry); + break; + default: pr_warn("%s: unrecognized huge page size 0x%lx\n", __func__, pagesize); + break; } return entry; } From 9a0e3b92b02e7a921b22f4d00d87e3506d06b92a Mon Sep 17 00:00:00 2001 From: Liao Chang Date: Thu, 24 Oct 2024 03:41:20 +0000 Subject: [PATCH 117/168] arm64: Return early when break handler is found on linked-list The search for breakpoint handlers iterate through the entire linked list. Given that all registered hook has a valid fn field, and no registered hooks share the same mask and imm. This commit optimize the efficiency slightly by returning early as a matching handler is found. Signed-off-by: Liao Chang Acked-by: Will Deacon Link: https://lore.kernel.org/r/20241024034120.3814224-1-liaochang1@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/debug-monitors.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 024a7b245056..4713a4c65b1b 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -303,7 +303,6 @@ static int call_break_hook(struct pt_regs *regs, unsigned long esr) { struct break_hook *hook; struct list_head *list; - int (*fn)(struct pt_regs *regs, unsigned long esr) = NULL; list = user_mode(regs) ? &user_break_hook : &kernel_break_hook; @@ -313,10 +312,10 @@ static int call_break_hook(struct pt_regs *regs, unsigned long esr) */ list_for_each_entry_rcu(hook, list, node) { if ((esr_brk_comment(esr) & ~hook->mask) == hook->imm) - fn = hook->fn; + return hook->fn(regs, esr); } - return fn ? fn(regs, esr) : DBG_HOOK_ERROR; + return DBG_HOOK_ERROR; } NOKPROBE_SYMBOL(call_break_hook); From 17a2409783f141c9fcd03b140d17bbb75e98c630 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 29 Oct 2024 12:34:21 +0000 Subject: [PATCH 118/168] kselftest/arm64: Use ksft_perror() to log MTE failures The logging in the allocation helpers variously uses ksft_print_msg() with very intermittent logging of errno and perror() (which won't produce KTAP conformant output) when logging the result of API calls that set errno. Standardise on using the ksft_perror() helper in these cases so that more information is available should the tests fail. 
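For context, ksft_perror() is a small helper from
tools/testing/selftests/kselftest.h along these lines (sketched from
memory, not part of this change):

	static inline void ksft_perror(const char *msg)
	{
		ksft_print_msg("%s: %s (%d)\n", msg, strerror(errno), errno);
	}

so the message, strerror() text and errno all land on stdout with the
KTAP "# " comment prefix, unlike plain perror() which writes to stderr.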
Signed-off-by: Mark Brown Acked-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20241029-arm64-mte-test-logging-v1-1-a128e732e36e@kernel.org Signed-off-by: Catalin Marinas --- .../testing/selftests/arm64/mte/mte_common_util.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.c b/tools/testing/selftests/arm64/mte/mte_common_util.c index 17fbe5cfe472..a1dc2fe5285b 100644 --- a/tools/testing/selftests/arm64/mte/mte_common_util.c +++ b/tools/testing/selftests/arm64/mte/mte_common_util.c @@ -150,13 +150,13 @@ static void *__mte_allocate_memory_range(size_t size, int mem_type, int mapping, map_flag |= MAP_PRIVATE; ptr = mmap(NULL, entire_size, prot_flag, map_flag, fd, 0); if (ptr == MAP_FAILED) { - ksft_print_msg("FAIL: mmap allocation\n"); + ksft_perror("mmap()"); return NULL; } if (mem_type == USE_MPROTECT) { if (mprotect(ptr, entire_size, prot_flag | PROT_MTE)) { + ksft_perror("mprotect(PROT_MTE)"); munmap(ptr, size); - ksft_print_msg("FAIL: mprotect PROT_MTE property\n"); return NULL; } } @@ -190,13 +190,13 @@ void *mte_allocate_file_memory(size_t size, int mem_type, int mapping, bool tags lseek(fd, 0, SEEK_SET); for (index = INIT_BUFFER_SIZE; index < size; index += INIT_BUFFER_SIZE) { if (write(fd, buffer, INIT_BUFFER_SIZE) != INIT_BUFFER_SIZE) { - perror("initialising buffer"); + ksft_perror("initialising buffer"); return NULL; } } index -= INIT_BUFFER_SIZE; if (write(fd, buffer, size - index) != size - index) { - perror("initialising buffer"); + ksft_perror("initialising buffer"); return NULL; } return __mte_allocate_memory_range(size, mem_type, mapping, 0, 0, tags, fd); @@ -217,12 +217,12 @@ void *mte_allocate_file_memory_tag_range(size_t size, int mem_type, int mapping, lseek(fd, 0, SEEK_SET); for (index = INIT_BUFFER_SIZE; index < map_size; index += INIT_BUFFER_SIZE) if (write(fd, buffer, INIT_BUFFER_SIZE) != INIT_BUFFER_SIZE) { - perror("initialising buffer"); + ksft_perror("initialising buffer"); return NULL; } index -= INIT_BUFFER_SIZE; if (write(fd, buffer, map_size - index) != map_size - index) { - perror("initialising buffer"); + ksft_perror("initialising buffer"); return NULL; } return __mte_allocate_memory_range(size, mem_type, mapping, range_before, @@ -358,7 +358,7 @@ int create_temp_file(void) /* Create a file in the tmpfs filesystem */ fd = mkstemp(&filename[0]); if (fd == -1) { - perror(filename); + ksft_perror(filename); ksft_print_msg("FAIL: Unable to open temporary file\n"); return 0; } From 1caeda5ef2510e8af57b75edb74bd1daa6364c1e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 31 Oct 2024 19:21:38 +0000 Subject: [PATCH 119/168] arm64/gcs: Fix outdated ptrace documentation The ptrace documentation for GCS was written prior to the implementation of clone3() when we still blocked enabling of GCS via ptrace. This restriction was relaxed as part of implementing clone3() support since we implemented support for the GCS not being managed by the kernel but the documentation still mentions the restriction. Update the documentation to reflect what was merged. We have not yet merged clone3() itself but all the support other than in clone() itself is there. 
Fixes: 7058bf87cd59 ("arm64/gcs: Document the ABI for Guarded Control Stacks") Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241031-arm64-gcs-doc-disable-v1-1-d7f6ded62046@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/gcs.rst | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Documentation/arch/arm64/gcs.rst b/Documentation/arch/arm64/gcs.rst index af58d9151cb7..1f65a3193e77 100644 --- a/Documentation/arch/arm64/gcs.rst +++ b/Documentation/arch/arm64/gcs.rst @@ -204,11 +204,8 @@ When returning from a signal handler: * A new regset NT_ARM_GCS is defined for use with PTRACE_GETREGSET and PTRACE_SETREGSET. -* Due to the complexity surrounding allocation and deallocation of stacks and - lack of practical application it is not possible to enable GCS via ptrace. - GCS may be disabled via the ptrace interface. - -* Other GCS modes may be configured via ptrace. +* The GCS mode, including enable and disable, may be configured via ptrace. + If GCS is enabled via ptrace no new GCS will be allocated for the thread. * Configuration via ptrace ignores locking of GCS mode bits. From 2287a4c1e11822d05a70d22f28b26bd810dd204e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 31 Oct 2024 08:35:19 +0000 Subject: [PATCH 120/168] arm64: Expose ID_AA64ISAR1_EL1.XS to sanitised feature consumers Despite KVM now being able to deal with XS-tagged TLBIs, we still don't expose these feature bits to KVM. Plumb in the feature in ID_AA64ISAR1_EL1. Fixes: 0feec7769a63 ("KVM: arm64: nv: Add handling of NXS-flavoured TLBI operations") Signed-off-by: Marc Zyngier Acked-by: Catalin Marinas Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20241031083519.364313-1-maz@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpufeature.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 718728a85430..db994d1fd97e 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -228,6 +228,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_XS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_I8MM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DGH_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_BF16_SHIFT, 4, 0), From 69c0d824779843b51ca2339b2163db4d3b40c54c Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 28 Oct 2024 20:22:31 +0000 Subject: [PATCH 121/168] kselftest/arm64: Fix encoding for SVE B16B16 test The test for SVE_B16B16 had a cut'n'paste of a SME instruction, fix it with a relevant SVE instruction. 
Fixes: 44d10c27bd75 ("kselftest/arm64: Add 2023 DPISA hwcap test coverage") Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241028-arm64-b16b16-test-v1-1-59a4a7449bdf@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/abi/hwcap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c index 7e95ba5fd496..265654ec48b9 100644 --- a/tools/testing/selftests/arm64/abi/hwcap.c +++ b/tools/testing/selftests/arm64/abi/hwcap.c @@ -361,8 +361,8 @@ static void sveaes_sigill(void) static void sveb16b16_sigill(void) { - /* BFADD ZA.H[W0, 0], {Z0.H-Z1.H} */ - asm volatile(".inst 0xC1E41C00" : : : ); + /* BFADD Z0.H, Z0.H, Z0.H */ + asm volatile(".inst 0x65000000" : : : ); } static void svepmull_sigill(void) From 525fd6a1b34ea8ce42e12db38380eed0efefb5d5 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 2 Nov 2024 10:31:54 +0100 Subject: [PATCH 122/168] arm64/fpsimd: Fix a typo s/FPSMID/FPSIMD/ M and I swapped. Fix it. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/2cbcb42615e9265bccc9b746465d7998382e605d.1730539907.git.christophe.jaillet@wanadoo.fr Signed-off-by: Catalin Marinas --- arch/arm64/kernel/fpsimd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 77006df20a75..cd7d71fe1fda 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -386,7 +386,7 @@ static void task_fpsimd_load(void) * fpsimd_save_user_state() or memory corruption, we * should always record an explicit format * when we save. We always at least have the - * memory allocated for FPSMID registers so + * memory allocated for FPSIMD registers so * try that and hope for the best. */ WARN_ON_ONCE(1); From 263e22d6bd1f8a12dc2e770cf190f8dfb31f867e Mon Sep 17 00:00:00 2001 From: Zheng Zengkai Date: Wed, 16 Oct 2024 17:54:58 +0800 Subject: [PATCH 123/168] ACPI: GTDT: Tighten the check for the array of platform timer structures As suggested by Marc and Lorenzo, first we need to check whether the platform_timer entry pointer is within gtdt bounds (< gtdt_end) before de-referencing what it points at to detect the length of the platform timer struct and then check that the length of current platform_timer struct is also valid, i.e. the length is not zero and within gtdt_end. Now next_platform_timer() only checks against gtdt_end for the entry of subsequent platform timer without checking the length of it and will not report error if the check failed and the existing check in function acpi_gtdt_init() is also not enough. Modify the for_each_platform_timer() iterator and use it combined with a dedicated check function platform_timer_valid() to do the check against table length (gtdt_end) for each element of platform timer array in function acpi_gtdt_init(), making sure that both their entry and length actually fit in the table. 
Suggested-by: Lorenzo Pieralisi Co-developed-by: Marc Zyngier Signed-off-by: Marc Zyngier Signed-off-by: Zheng Zengkai Reviewed-by: Lorenzo Pieralisi Reviewed-by: Hanjun Guo Tested-by: Hanjun Guo Link: https://lore.kernel.org/r/20241016095458.34126-1-zhengzengkai@huawei.com Signed-off-by: Catalin Marinas --- drivers/acpi/arm64/gtdt.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c index c0e77c1c8e09..d7c4e1b9915b 100644 --- a/drivers/acpi/arm64/gtdt.c +++ b/drivers/acpi/arm64/gtdt.c @@ -36,19 +36,25 @@ struct acpi_gtdt_descriptor { static struct acpi_gtdt_descriptor acpi_gtdt_desc __initdata; -static inline __init void *next_platform_timer(void *platform_timer) +static __init bool platform_timer_valid(void *platform_timer) { struct acpi_gtdt_header *gh = platform_timer; - platform_timer += gh->length; - if (platform_timer < acpi_gtdt_desc.gtdt_end) - return platform_timer; + return (platform_timer >= (void *)(acpi_gtdt_desc.gtdt + 1) && + platform_timer < acpi_gtdt_desc.gtdt_end && + gh->length != 0 && + platform_timer + gh->length <= acpi_gtdt_desc.gtdt_end); +} - return NULL; +static __init void *next_platform_timer(void *platform_timer) +{ + struct acpi_gtdt_header *gh = platform_timer; + + return platform_timer + gh->length; } #define for_each_platform_timer(_g) \ - for (_g = acpi_gtdt_desc.platform_timer; _g; \ + for (_g = acpi_gtdt_desc.platform_timer; platform_timer_valid(_g);\ _g = next_platform_timer(_g)) static inline bool is_timer_block(void *platform_timer) @@ -157,6 +163,7 @@ int __init acpi_gtdt_init(struct acpi_table_header *table, { void *platform_timer; struct acpi_table_gtdt *gtdt; + int cnt = 0; gtdt = container_of(table, struct acpi_table_gtdt, header); acpi_gtdt_desc.gtdt = gtdt; @@ -176,12 +183,16 @@ int __init acpi_gtdt_init(struct acpi_table_header *table, return 0; } - platform_timer = (void *)gtdt + gtdt->platform_timer_offset; - if (platform_timer < (void *)table + sizeof(struct acpi_table_gtdt)) { + acpi_gtdt_desc.platform_timer = (void *)gtdt + gtdt->platform_timer_offset; + for_each_platform_timer(platform_timer) + cnt++; + + if (cnt != gtdt->platform_timer_count) { + acpi_gtdt_desc.platform_timer = NULL; pr_err(FW_BUG "invalid timer data.\n"); return -EINVAL; } - acpi_gtdt_desc.platform_timer = platform_timer; + if (platform_timer_count) *platform_timer_count = gtdt->platform_timer_count; From ced841702ee7013ef08159a2cb5cadf0e0820b13 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 4 Nov 2024 09:46:17 +0530 Subject: [PATCH 124/168] arm64/mm: Drop setting PTE_TYPE_PAGE in pte_mkcont() PTE_TYPE_PAGE bits were being set in pte_mkcont() because PTE_TABLE_BIT was being cleared in pte_mkhuge(). But after arch_make_huge_pte() modification in commit f8192813dcbe ("arm64/mm: Re-organize arch_make_huge_pte()"), which dropped pte_mkhuge() completely, setting back PTE_TYPE_PAGE bits is no longer necessary. Change pte_mkcont() to only set PTE_CONT. 
Cc: Will Deacon Cc: Ard Biesheuvel Cc: Ryan Roberts Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20241104041617.3804617-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 609fd4447b19..0569b2308fd1 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -265,8 +265,7 @@ static inline pte_t pte_mkspecial(pte_t pte) static inline pte_t pte_mkcont(pte_t pte) { - pte = set_pte_bit(pte, __pgprot(PTE_CONT)); - return set_pte_bit(pte, __pgprot(PTE_TYPE_PAGE)); + return set_pte_bit(pte, __pgprot(PTE_CONT)); } static inline pte_t pte_mknoncont(pte_t pte) From 466ece4c6e195263ccab2d402ee06b2f8d639a0e Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 29 Oct 2024 14:45:36 +0000 Subject: [PATCH 125/168] arm64: signal: Remove unnecessary check when saving POE state The POE frame record is allocated unconditionally if POE is supported. If the allocation fails, a SIGSEGV is delivered before setup_sigframe() can be reached. As a result there is no need to consider poe_offset before saving POR_EL0; just remove that check. This is in line with other frame records (FPMR, TPIDR2). Reviewed-by: Mark Brown Reviewed-by: Dave Martin Acked-by: Catalin Marinas Signed-off-by: Kevin Brodsky Link: https://lore.kernel.org/r/20241029144539.111155-3-kevin.brodsky@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index c7d311d8b92a..d5eb517cc4df 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -1154,7 +1154,7 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, err |= preserve_fpmr_context(fpmr_ctx); } - if (system_supports_poe() && err == 0 && user->poe_offset) { + if (system_supports_poe() && err == 0) { struct poe_context __user *poe_ctx = apply_user_offset(user, user->poe_offset); From 8edbbfcc1ed3ca2170a2c5e888a76ba3652e62c9 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 29 Oct 2024 14:45:37 +0000 Subject: [PATCH 126/168] arm64: signal: Remove unused macro Commit 33f082614c34 ("arm64: signal: Allow expansion of the signal frame") introduced the BASE_SIGFRAME_SIZE macro but it has apparently never been used; just remove it. Reviewed-by: Dave Martin Acked-by: Catalin Marinas Signed-off-by: Kevin Brodsky Link: https://lore.kernel.org/r/20241029144539.111155-4-kevin.brodsky@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/signal.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index d5eb517cc4df..b077181a66c0 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -79,7 +79,6 @@ struct user_access_state { u64 por_el0; }; -#define BASE_SIGFRAME_SIZE round_up(sizeof(struct rt_sigframe), 16) #define TERMINATOR_SIZE round_up(sizeof(struct _aarch64_ctx), 16) #define EXTRA_CONTEXT_SIZE round_up(sizeof(struct extra_context), 16) From 6e182dc9f2680681ffb0b6d9757927f1bd321b38 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 29 Oct 2024 14:45:38 +0000 Subject: [PATCH 127/168] selftests/mm: Use generic pkey register manipulation pkey_sighandler_tests.c currently hardcodes x86 PKRU encodings. 
The first step towards running those tests on arm64 is to abstract away the pkey register values. Since those tests want to deny access to all keys except a few, we have each arch define PKEY_REG_ALLOW_NONE, the pkey register value denying access to all keys. We then use the existing set_pkey_bits() helper to grant access to specific keys. Because pkeys may also remove the execute permission on arm64, we need to be a little careful: all code is mapped with pkey 0, and we need it to remain executable. pkey_reg_restrictive_default() is introduced for that purpose: the value it returns prevents RW access to all pkeys, but retains X permission for pkey 0. test_pkru_preserved_after_sigusr1() only checks that the pkey register value remains unchanged after a signal is delivered, so the particular value is irrelevant. We enable pkey 0 and a few more arbitrary keys in the smallest range available on all architectures (8 keys on arm64). Signed-off-by: Kevin Brodsky Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20241029144539.111155-5-kevin.brodsky@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/mm/pkey-arm64.h | 1 + tools/testing/selftests/mm/pkey-x86.h | 2 + .../selftests/mm/pkey_sighandler_tests.c | 53 +++++++++++++++---- 3 files changed, 47 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/mm/pkey-arm64.h b/tools/testing/selftests/mm/pkey-arm64.h index 580e1b0bb38e..d57fbeace38f 100644 --- a/tools/testing/selftests/mm/pkey-arm64.h +++ b/tools/testing/selftests/mm/pkey-arm64.h @@ -31,6 +31,7 @@ #define NR_RESERVED_PKEYS 1 /* pkey-0 */ #define PKEY_ALLOW_ALL 0x77777777 +#define PKEY_REG_ALLOW_NONE 0x0 #define PKEY_BITS_PER_PKEY 4 #define PAGE_SIZE sysconf(_SC_PAGESIZE) diff --git a/tools/testing/selftests/mm/pkey-x86.h b/tools/testing/selftests/mm/pkey-x86.h index 5f28e26a2511..ac91777c8917 100644 --- a/tools/testing/selftests/mm/pkey-x86.h +++ b/tools/testing/selftests/mm/pkey-x86.h @@ -34,6 +34,8 @@ #define PAGE_SIZE 4096 #define MB (1<<20) +#define PKEY_REG_ALLOW_NONE 0x55555555 + static inline void __page_o_noops(void) { /* 8-bytes of instruction * 512 bytes = 1 page */ diff --git a/tools/testing/selftests/mm/pkey_sighandler_tests.c b/tools/testing/selftests/mm/pkey_sighandler_tests.c index a8088b645ad6..501880dbdc37 100644 --- a/tools/testing/selftests/mm/pkey_sighandler_tests.c +++ b/tools/testing/selftests/mm/pkey_sighandler_tests.c @@ -11,6 +11,7 @@ */ #define _GNU_SOURCE #define __SANE_USERSPACE_TYPES__ +#include #include #include #include @@ -65,6 +66,20 @@ long syscall_raw(long n, long a1, long a2, long a3, long a4, long a5, long a6) return ret; } +/* + * Returns the most restrictive pkey register value that can be used by the + * tests. + */ +static inline u64 pkey_reg_restrictive_default(void) +{ + /* + * Disallow everything except execution on pkey 0, so that each caller + * doesn't need to enable it explicitly (the selftest code runs with + * its code mapped with pkey 0). 
+ */ + return set_pkey_bits(PKEY_REG_ALLOW_NONE, 0, PKEY_DISABLE_ACCESS); +} + static void sigsegv_handler(int signo, siginfo_t *info, void *ucontext) { pthread_mutex_lock(&mutex); @@ -113,7 +128,7 @@ static void raise_sigusr2(void) static void *thread_segv_with_pkey0_disabled(void *ptr) { /* Disable MPK 0 (and all others too) */ - __write_pkey_reg(0x55555555); + __write_pkey_reg(pkey_reg_restrictive_default()); /* Segfault (with SEGV_MAPERR) */ *(int *) (0x1) = 1; @@ -123,7 +138,7 @@ static void *thread_segv_with_pkey0_disabled(void *ptr) static void *thread_segv_pkuerr_stack(void *ptr) { /* Disable MPK 0 (and all others too) */ - __write_pkey_reg(0x55555555); + __write_pkey_reg(pkey_reg_restrictive_default()); /* After we disable MPK 0, we can't access the stack to return */ return NULL; @@ -133,6 +148,7 @@ static void *thread_segv_maperr_ptr(void *ptr) { stack_t *stack = ptr; int *bad = (int *)1; + u64 pkey_reg; /* * Setup alternate signal stack, which should be pkey_mprotect()ed by @@ -142,7 +158,9 @@ static void *thread_segv_maperr_ptr(void *ptr) syscall_raw(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0); /* Disable MPK 0. Only MPK 1 is enabled. */ - __write_pkey_reg(0x55555551); + pkey_reg = pkey_reg_restrictive_default(); + pkey_reg = set_pkey_bits(pkey_reg, 1, PKEY_UNRESTRICTED); + __write_pkey_reg(pkey_reg); /* Segfault */ *bad = 1; @@ -240,6 +258,7 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void) int pkey; int parent_pid = 0; int child_pid = 0; + u64 pkey_reg; sa.sa_flags = SA_SIGINFO | SA_ONSTACK; @@ -257,7 +276,10 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void) assert(stack != MAP_FAILED); /* Allow access to MPK 0 and MPK 1 */ - __write_pkey_reg(0x55555550); + pkey_reg = pkey_reg_restrictive_default(); + pkey_reg = set_pkey_bits(pkey_reg, 0, PKEY_UNRESTRICTED); + pkey_reg = set_pkey_bits(pkey_reg, 1, PKEY_UNRESTRICTED); + __write_pkey_reg(pkey_reg); /* Protect the new stack with MPK 1 */ pkey = pkey_alloc(0, 0); @@ -307,7 +329,13 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void) static void test_pkru_preserved_after_sigusr1(void) { struct sigaction sa; - unsigned long pkru = 0x45454544; + u64 pkey_reg; + + /* Allow access to MPK 0 and an arbitrary set of keys */ + pkey_reg = pkey_reg_restrictive_default(); + pkey_reg = set_pkey_bits(pkey_reg, 0, PKEY_UNRESTRICTED); + pkey_reg = set_pkey_bits(pkey_reg, 3, PKEY_UNRESTRICTED); + pkey_reg = set_pkey_bits(pkey_reg, 7, PKEY_UNRESTRICTED); sa.sa_flags = SA_SIGINFO; @@ -320,7 +348,7 @@ static void test_pkru_preserved_after_sigusr1(void) memset(&siginfo, 0, sizeof(siginfo)); - __write_pkey_reg(pkru); + __write_pkey_reg(pkey_reg); raise(SIGUSR1); @@ -330,7 +358,7 @@ static void test_pkru_preserved_after_sigusr1(void) pthread_mutex_unlock(&mutex); /* Ensure the pkru value is the same after returning from signal. */ - ksft_test_result(pkru == __read_pkey_reg() && + ksft_test_result(pkey_reg == __read_pkey_reg() && siginfo.si_signo == SIGUSR1, "%s\n", __func__); } @@ -347,6 +375,7 @@ static noinline void *thread_sigusr2_self(void *ptr) 'S', 'I', 'G', 'U', 'S', 'R', '2', '.', '.', '.', '\n', '\0'}; stack_t *stack = ptr; + u64 pkey_reg; /* * Setup alternate signal stack, which should be pkey_mprotect()ed by @@ -356,7 +385,9 @@ static noinline void *thread_sigusr2_self(void *ptr) syscall(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0); /* Disable MPK 0. Only MPK 2 is enabled. 
*/ - __write_pkey_reg(0x55555545); + pkey_reg = pkey_reg_restrictive_default(); + pkey_reg = set_pkey_bits(pkey_reg, 2, PKEY_UNRESTRICTED); + __write_pkey_reg(pkey_reg); raise_sigusr2(); @@ -384,6 +415,7 @@ static void test_pkru_sigreturn(void) int pkey; int parent_pid = 0; int child_pid = 0; + u64 pkey_reg; sa.sa_handler = SIG_DFL; sa.sa_flags = 0; @@ -418,7 +450,10 @@ static void test_pkru_sigreturn(void) * the current thread's stack is protected by the default MPK 0. Hence * both need to be enabled. */ - __write_pkey_reg(0x55555544); + pkey_reg = pkey_reg_restrictive_default(); + pkey_reg = set_pkey_bits(pkey_reg, 0, PKEY_UNRESTRICTED); + pkey_reg = set_pkey_bits(pkey_reg, 2, PKEY_UNRESTRICTED); + __write_pkey_reg(pkey_reg); /* Protect the stack with MPK 2 */ pkey = pkey_alloc(0, 0); From 49f59573e9e06093ba23caf4ea1641b16e7e497e Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 29 Oct 2024 14:45:39 +0000 Subject: [PATCH 128/168] selftests/mm: Enable pkey_sighandler_tests on arm64 pkey_sighandler_tests.c makes raw syscalls using its own helper, syscall_raw(). One of those syscalls is clone, which is problematic as every architecture has a different opinion on the order of its arguments. To complete arm64 support, we therefore add an appropriate implementation in syscall_raw(), and introduce a clone_raw() helper that shuffles arguments as needed for each arch. Having done this, we enable building pkey_sighandler_tests for arm64 in the Makefile. Signed-off-by: Kevin Brodsky Link: https://lore.kernel.org/r/20241029144539.111155-6-kevin.brodsky@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/mm/Makefile | 8 +-- .../selftests/mm/pkey_sighandler_tests.c | 62 ++++++++++++++----- 2 files changed, 50 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 02e1204971b0..0f8c110e0805 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -105,12 +105,12 @@ endif ifeq ($(CAN_BUILD_X86_64),1) TEST_GEN_FILES += $(BINARIES_64) endif -else -ifneq (,$(filter $(ARCH),arm64 powerpc)) +else ifeq ($(ARCH),arm64) +TEST_GEN_FILES += protection_keys +TEST_GEN_FILES += pkey_sighandler_tests +else ifeq ($(ARCH),powerpc) TEST_GEN_FILES += protection_keys -endif - endif ifneq (,$(filter $(ARCH),arm64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390)) diff --git a/tools/testing/selftests/mm/pkey_sighandler_tests.c b/tools/testing/selftests/mm/pkey_sighandler_tests.c index 501880dbdc37..c593a426341c 100644 --- a/tools/testing/selftests/mm/pkey_sighandler_tests.c +++ b/tools/testing/selftests/mm/pkey_sighandler_tests.c @@ -60,12 +60,44 @@ long syscall_raw(long n, long a1, long a2, long a3, long a4, long a5, long a6) : "=a"(ret) : "a"(n), "b"(a1), "c"(a2), "d"(a3), "S"(a4), "D"(a5) : "memory"); +#elif defined __aarch64__ + register long x0 asm("x0") = a1; + register long x1 asm("x1") = a2; + register long x2 asm("x2") = a3; + register long x3 asm("x3") = a4; + register long x4 asm("x4") = a5; + register long x5 asm("x5") = a6; + register long x8 asm("x8") = n; + asm volatile ("svc #0" + : "=r"(x0) + : "r"(x0), "r"(x1), "r"(x2), "r"(x3), "r"(x4), "r"(x5), "r"(x8) + : "memory"); + ret = x0; #else # error syscall_raw() not implemented #endif return ret; } +static inline long clone_raw(unsigned long flags, void *stack, + int *parent_tid, int *child_tid) +{ + long a1 = flags; + long a2 = (long)stack; + long a3 = (long)parent_tid; +#if defined(__x86_64__) || defined(__i386) + long a4 = 
(long)child_tid; + long a5 = 0; +#elif defined(__aarch64__) + long a4 = 0; + long a5 = (long)child_tid; +#else +# error clone_raw() not implemented +#endif + + return syscall_raw(SYS_clone, a1, a2, a3, a4, a5, 0); +} + /* * Returns the most restrictive pkey register value that can be used by the * tests. @@ -294,14 +326,13 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void) memset(&siginfo, 0, sizeof(siginfo)); /* Use clone to avoid newer glibcs using rseq on new threads */ - long ret = syscall_raw(SYS_clone, - CLONE_VM | CLONE_FS | CLONE_FILES | - CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | - CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | - CLONE_DETACHED, - (long) ((char *)(stack) + STACK_SIZE), - (long) &parent_pid, - (long) &child_pid, 0, 0); + long ret = clone_raw(CLONE_VM | CLONE_FS | CLONE_FILES | + CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | + CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | + CLONE_DETACHED, + stack + STACK_SIZE, + &parent_pid, + &child_pid); if (ret < 0) { errno = -ret; @@ -466,14 +497,13 @@ static void test_pkru_sigreturn(void) sigstack.ss_size = STACK_SIZE; /* Use clone to avoid newer glibcs using rseq on new threads */ - long ret = syscall_raw(SYS_clone, - CLONE_VM | CLONE_FS | CLONE_FILES | - CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | - CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | - CLONE_DETACHED, - (long) ((char *)(stack) + STACK_SIZE), - (long) &parent_pid, - (long) &child_pid, 0, 0); + long ret = clone_raw(CLONE_VM | CLONE_FS | CLONE_FILES | + CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | + CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID | + CLONE_DETACHED, + stack + STACK_SIZE, + &parent_pid, + &child_pid); if (ret < 0) { errno = -ret; From aa47dcda2708e571695dae2e3f9537d9a8eb804c Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Sat, 2 Nov 2024 18:42:31 +0800 Subject: [PATCH 129/168] arm64/sysreg: Update ID_AA64MMFR1_EL1 register Update ID_AA64MMFR1_EL1 register fields definition per DDI0601 (ID092424) 2024-09. ID_AA64MMFR1_EL1.ETS adds definition for FEAT_ETS2 and FEAT_ETS3. ID_AA64MMFR1_EL1.HAFDBS adds definition for FEAT_HAFT and FEAT_HDBSS. Reviewed-by: Mark Brown Signed-off-by: Yicong Yang Link: https://lore.kernel.org/r/20241102104235.62560-2-yangyicong@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/tools/sysreg | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 8d637ac4b7c6..ae64ba810298 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1648,6 +1648,8 @@ EndEnum UnsignedEnum 39:36 ETS 0b0000 NI 0b0001 IMP + 0b0010 ETS2 + 0b0011 ETS3 EndEnum UnsignedEnum 35:32 TWED 0b0000 NI @@ -1688,6 +1690,8 @@ UnsignedEnum 3:0 HAFDBS 0b0000 NI 0b0001 AF 0b0010 DBM + 0b0011 HAFT + 0b0100 HDBSS EndEnum EndSysreg From 926b66e2ebc8c055b9fea3fb3e5f5b67c80e8e7a Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Sat, 2 Nov 2024 18:42:32 +0800 Subject: [PATCH 130/168] arm64: setup: name 'tcr2' register TCR2_EL1 introduced some additional controls besides TCR_EL1. Currently only PIE is supported and enabled by writing TCR2_EL1 directly if PIE detected. Introduce a named register 'tcr2' just like 'tcr' we've already had. It'll be initialized to 0 and updated if certain feature detected and needs to be enabled. Touch the TCR2_EL1 registers at last with the updated 'tcr2' value if FEAT_TCR2 supported by checking ID_AA64MMFR3_EL1.TCRX. Then we can extend the support of other features controlled by TCR2_EL1. 
Reviewed-by: Catalin Marinas Signed-off-by: Yicong Yang Link: https://lore.kernel.org/r/20241102104235.62560-3-yangyicong@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/mm/proc.S | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 8abdc7fed321..ccbae4525891 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -465,10 +465,12 @@ SYM_FUNC_START(__cpu_setup) */ mair .req x17 tcr .req x16 + tcr2 .req x15 mov_q mair, MAIR_EL1_SET mov_q tcr, TCR_T0SZ(IDMAP_VA_BITS) | TCR_T1SZ(VA_BITS_MIN) | TCR_CACHE_FLAGS | \ TCR_SHARED | TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \ TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS + mov tcr2, xzr tcr_clear_errata_bits tcr, x9, x5 @@ -525,11 +527,16 @@ alternative_else_nop_endif #undef PTE_MAYBE_NG #undef PTE_MAYBE_SHARED - mov x0, TCR2_EL1x_PIE - msr REG_TCR2_EL1, x0 + orr tcr2, tcr2, TCR2_EL1x_PIE .Lskip_indirection: + mrs_s x1, SYS_ID_AA64MMFR3_EL1 + ubfx x1, x1, #ID_AA64MMFR3_EL1_TCRX_SHIFT, #4 + cbz x1, 1f + msr REG_TCR2_EL1, tcr2 +1: + /* * Prepare SCTLR */ @@ -538,4 +545,5 @@ alternative_else_nop_endif .unreq mair .unreq tcr + .unreq tcr2 SYM_FUNC_END(__cpu_setup) From baec2397971960e8d5661c616ce20d6a24d7dc4f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 5 Nov 2024 10:39:20 +0100 Subject: [PATCH 131/168] arm64/mm: Sanity check PTE address before runtime P4D/PUD folding The runtime P4D/PUD folding logic assumes that the respective pgd_t* and p4d_t* arguments are pointers into actual page tables that are part of the hierarchy being operated on. This may not always be the case, and we have been bitten once by this already [0], where the argument was actually a stack variable, and in this case, the logic does not work at all. So let's add a VM_BUG_ON() for each case, to ensure that the address of the provided page table entry is consistent with the address being translated. [0] https://lore.kernel.org/all/20240725090345.28461-1-will@kernel.org/T/#u Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20241105093919.1312049-2-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 0569b2308fd1..073e25f73445 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -921,6 +921,9 @@ static inline phys_addr_t p4d_page_paddr(p4d_t p4d) static inline pud_t *p4d_to_folded_pud(p4d_t *p4dp, unsigned long addr) { + /* Ensure that 'p4dp' indexes a page table according to 'addr' */ + VM_BUG_ON(((addr >> P4D_SHIFT) ^ ((u64)p4dp >> 3)) % PTRS_PER_P4D); + return (pud_t *)PTR_ALIGN_DOWN(p4dp, PAGE_SIZE) + pud_index(addr); } @@ -1045,6 +1048,9 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) static inline p4d_t *pgd_to_folded_p4d(pgd_t *pgdp, unsigned long addr) { + /* Ensure that 'pgdp' indexes a page table according to 'addr' */ + VM_BUG_ON(((addr >> PGDIR_SHIFT) ^ ((u64)pgdp >> 3)) % PTRS_PER_PGD); + return (p4d_t *)PTR_ALIGN_DOWN(pgdp, PAGE_SIZE) + p4d_index(addr); } From dc9b74a76320dd87335956f27f09791fa922ea9b Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 5 Nov 2024 10:11:54 +0530 Subject: [PATCH 132/168] arm64/ptdump: Test both PTE_TABLE_BIT and PTE_VALID for block mappings Test both PTE_TABLE_BIT and PTE_VALID for block mappings, similar to KVM S2 ptdump. 
This ensures consistency in identifying block mappings, both in the S1 and the S2 page tables. Besides being kernel page tables, there will not be any unmapped (!PTE_VALID) block mappings. Cc: Will Deacon Cc: Ard Biesheuvel Cc: Ryan Roberts Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20241105044154.4064181-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/mm/ptdump.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 264c5f9b97d8..688fbe0271ca 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -80,10 +80,10 @@ static const struct ptdump_prot_bits pte_bits[] = { .set = "CON", .clear = " ", }, { - .mask = PTE_TABLE_BIT, - .val = PTE_TABLE_BIT, - .set = " ", - .clear = "BLK", + .mask = PTE_TABLE_BIT | PTE_VALID, + .val = PTE_VALID, + .set = "BLK", + .clear = " ", }, { .mask = PTE_UXN, .val = PTE_UXN, From efe72541355d4d40a4f076af453f6533e98e058c Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Sat, 2 Nov 2024 18:42:33 +0800 Subject: [PATCH 133/168] arm64: Add support for FEAT_HAFT Armv8.9/v9.4 introduces the feature Hardware managed Access Flag for Table descriptors (FEAT_HAFT). The feature is indicated by ID_AA64MMFR1_EL1.HAFDBS == 0b0011 and can be enabled by TCR2_EL1.HAFT so it has a dependency on FEAT_TCR2. Adds the Kconfig for FEAT_HAFT and support detecting and enabling the feature. The feature is enabled in __cpu_setup() before MMU on just like HA. A CPU capability is added to notify the user of the feature. Add definition of P{G,4,U,M}D_TABLE_AF bit and set the AF bit when creating the page table, which will save the hardware from having to update them at runtime. This will be ignored if FEAT_HAFT is not enabled. The AF bit of table descriptors cannot be managed by the software per spec, unlike the HA. So this should be used only if it's supported system wide by system_supports_haft(). Signed-off-by: Yicong Yang Link: https://lore.kernel.org/r/20241102104235.62560-4-yangyicong@huawei.com Reviewed-by: Catalin Marinas [catalin.marinas@arm.com: added the ID check back to __cpu_setup in case of future CPU errata] Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 15 +++++++++++++++ arch/arm64/include/asm/cpufeature.h | 6 ++++++ arch/arm64/include/asm/pgalloc.h | 12 +++++++----- arch/arm64/include/asm/pgtable-hwdef.h | 4 ++++ arch/arm64/kernel/cpufeature.c | 15 +++++++++++++++ arch/arm64/mm/fixmap.c | 9 ++++++--- arch/arm64/mm/mmu.c | 8 ++++---- arch/arm64/mm/proc.S | 7 ++++++- arch/arm64/tools/cpucaps | 1 + 9 files changed, 64 insertions(+), 13 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3e29b44d2d7b..b90f92f89d54 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2176,6 +2176,21 @@ config ARCH_PKEY_BITS int default 3 +config ARM64_HAFT + bool "Support for Hardware managed Access Flag for Table Descriptors" + depends on ARM64_HW_AFDBM + default y + help + The ARMv8.9/ARMv9.5 introduces the feature Hardware managed Access + Flag for Table descriptors. When enabled an architectural executed + memory access will update the Access Flag in each Table descriptor + which is accessed during the translation table walk and for which + the Access Flag is 0. The Access Flag of the Table descriptor use + the same bit of PTE_AF. + + The feature will only be enabled if all the CPUs in the system + support this feature. 
If unsure, say Y. + endmenu # "ARMv8.9 architectural features" config ARM64_SVE diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 3d261cc123c1..ed8c784ca082 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -838,6 +838,12 @@ static inline bool system_supports_poe(void) alternative_has_cap_unlikely(ARM64_HAS_S1POE); } +static inline bool system_supports_haft(void) +{ + return IS_ENABLED(CONFIG_ARM64_HAFT) && + cpus_have_final_cap(ARM64_HAFT); +} + int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); bool try_emulate_mrs(struct pt_regs *regs, u32 isn); diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index 8ff5f2a2579e..e75422864d1b 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -28,7 +28,7 @@ static inline void __pud_populate(pud_t *pudp, phys_addr_t pmdp, pudval_t prot) static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmdp) { - pudval_t pudval = PUD_TYPE_TABLE; + pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_AF; pudval |= (mm == &init_mm) ? PUD_TABLE_UXN : PUD_TABLE_PXN; __pud_populate(pudp, __pa(pmdp), pudval); @@ -50,7 +50,7 @@ static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot) static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp) { - p4dval_t p4dval = P4D_TYPE_TABLE; + p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_AF; p4dval |= (mm == &init_mm) ? P4D_TABLE_UXN : P4D_TABLE_PXN; __p4d_populate(p4dp, __pa(pudp), p4dval); @@ -79,7 +79,7 @@ static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t p4dp, pgdval_t prot) static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, p4d_t *p4dp) { - pgdval_t pgdval = PGD_TYPE_TABLE; + pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_AF; pgdval |= (mm == &init_mm) ? 
PGD_TABLE_UXN : PGD_TABLE_PXN; __pgd_populate(pgdp, __pa(p4dp), pgdval); @@ -127,14 +127,16 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep) { VM_BUG_ON(mm && mm != &init_mm); - __pmd_populate(pmdp, __pa(ptep), PMD_TYPE_TABLE | PMD_TABLE_UXN); + __pmd_populate(pmdp, __pa(ptep), + PMD_TYPE_TABLE | PMD_TABLE_AF | PMD_TABLE_UXN); } static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep) { VM_BUG_ON(mm == &init_mm); - __pmd_populate(pmdp, page_to_phys(ptep), PMD_TYPE_TABLE | PMD_TABLE_PXN); + __pmd_populate(pmdp, page_to_phys(ptep), + PMD_TYPE_TABLE | PMD_TABLE_AF | PMD_TABLE_PXN); } #endif diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index fd330c1db289..c78a988cca93 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -99,6 +99,7 @@ #define PGD_TYPE_TABLE (_AT(pgdval_t, 3) << 0) #define PGD_TABLE_BIT (_AT(pgdval_t, 1) << 1) #define PGD_TYPE_MASK (_AT(pgdval_t, 3) << 0) +#define PGD_TABLE_AF (_AT(pgdval_t, 1) << 10) /* Ignored if no FEAT_HAFT */ #define PGD_TABLE_PXN (_AT(pgdval_t, 1) << 59) #define PGD_TABLE_UXN (_AT(pgdval_t, 1) << 60) @@ -110,6 +111,7 @@ #define P4D_TYPE_MASK (_AT(p4dval_t, 3) << 0) #define P4D_TYPE_SECT (_AT(p4dval_t, 1) << 0) #define P4D_SECT_RDONLY (_AT(p4dval_t, 1) << 7) /* AP[2] */ +#define P4D_TABLE_AF (_AT(p4dval_t, 1) << 10) /* Ignored if no FEAT_HAFT */ #define P4D_TABLE_PXN (_AT(p4dval_t, 1) << 59) #define P4D_TABLE_UXN (_AT(p4dval_t, 1) << 60) @@ -121,6 +123,7 @@ #define PUD_TYPE_MASK (_AT(pudval_t, 3) << 0) #define PUD_TYPE_SECT (_AT(pudval_t, 1) << 0) #define PUD_SECT_RDONLY (_AT(pudval_t, 1) << 7) /* AP[2] */ +#define PUD_TABLE_AF (_AT(pudval_t, 1) << 10) /* Ignored if no FEAT_HAFT */ #define PUD_TABLE_PXN (_AT(pudval_t, 1) << 59) #define PUD_TABLE_UXN (_AT(pudval_t, 1) << 60) @@ -131,6 +134,7 @@ #define PMD_TYPE_TABLE (_AT(pmdval_t, 3) << 0) #define PMD_TYPE_SECT (_AT(pmdval_t, 1) << 0) #define PMD_TABLE_BIT (_AT(pmdval_t, 1) << 1) +#define PMD_TABLE_AF (_AT(pmdval_t, 1) << 10) /* Ignored if no FEAT_HAFT */ /* * Section diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 718728a85430..878712fa0d3b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2590,6 +2590,21 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpus = &dbm_cpus, ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, DBM) }, +#endif +#ifdef CONFIG_ARM64_HAFT + { + .desc = "Hardware managed Access Flag for Table Descriptors", + /* + * Contrary to the page/block access flag, the table access flag + * cannot be emulated in software (no access fault will occur). + * Therefore this should be used only if it's supported system + * wide. 
+ */ + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAFT, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, HAFT) + }, #endif { .desc = "CRC32 instructions", diff --git a/arch/arm64/mm/fixmap.c b/arch/arm64/mm/fixmap.c index de1e09d986ad..c5c5425791da 100644 --- a/arch/arm64/mm/fixmap.c +++ b/arch/arm64/mm/fixmap.c @@ -47,7 +47,8 @@ static void __init early_fixmap_init_pte(pmd_t *pmdp, unsigned long addr) if (pmd_none(pmd)) { ptep = bm_pte[BM_PTE_TABLE_IDX(addr)]; - __pmd_populate(pmdp, __pa_symbol(ptep), PMD_TYPE_TABLE); + __pmd_populate(pmdp, __pa_symbol(ptep), + PMD_TYPE_TABLE | PMD_TABLE_AF); } } @@ -59,7 +60,8 @@ static void __init early_fixmap_init_pmd(pud_t *pudp, unsigned long addr, pmd_t *pmdp; if (pud_none(pud)) - __pud_populate(pudp, __pa_symbol(bm_pmd), PUD_TYPE_TABLE); + __pud_populate(pudp, __pa_symbol(bm_pmd), + PUD_TYPE_TABLE | PUD_TABLE_AF); pmdp = pmd_offset_kimg(pudp, addr); do { @@ -86,7 +88,8 @@ static void __init early_fixmap_init_pud(p4d_t *p4dp, unsigned long addr, } if (p4d_none(p4d)) - __p4d_populate(p4dp, __pa_symbol(bm_pud), P4D_TYPE_TABLE); + __p4d_populate(p4dp, __pa_symbol(bm_pud), + P4D_TYPE_TABLE | P4D_TABLE_AF); pudp = pud_offset_kimg(p4dp, addr); early_fixmap_init_pmd(pudp, addr, end); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index e55b02fbddc8..6441a45eaeda 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -201,7 +201,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, BUG_ON(pmd_sect(pmd)); if (pmd_none(pmd)) { - pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN; + pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF; phys_addr_t pte_phys; if (flags & NO_EXEC_MAPPINGS) @@ -288,7 +288,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, */ BUG_ON(pud_sect(pud)); if (pud_none(pud)) { - pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN; + pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF; phys_addr_t pmd_phys; if (flags & NO_EXEC_MAPPINGS) @@ -333,7 +333,7 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, pud_t *pudp; if (p4d_none(p4d)) { - p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN; + p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN | P4D_TABLE_AF; phys_addr_t pud_phys; if (flags & NO_EXEC_MAPPINGS) @@ -391,7 +391,7 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, p4d_t *p4dp; if (pgd_none(pgd)) { - pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_UXN; + pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_UXN | PGD_TABLE_AF; phys_addr_t p4d_phys; if (flags & NO_EXEC_MAPPINGS) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index ccbae4525891..b8edc5765441 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -495,9 +495,14 @@ alternative_else_nop_endif * via capabilities. 
*/ mrs x9, ID_AA64MMFR1_EL1 - and x9, x9, ID_AA64MMFR1_EL1_HAFDBS_MASK + ubfx x9, x9, ID_AA64MMFR1_EL1_HAFDBS_SHIFT, #4 cbz x9, 1f orr tcr, tcr, #TCR_HA // hardware Access flag update +#ifdef CONFIG_ARM64_HAFT + cmp x9, ID_AA64MMFR1_EL1_HAFDBS_HAFT + b.lt 1f + orr tcr2, tcr2, TCR2_EL1x_HAFT +#endif /* CONFIG_ARM64_HAFT */ 1: #endif /* CONFIG_ARM64_HW_AFDBM */ msr mair_el1, mair diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index eedb5acc21ed..b35004fa8313 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -56,6 +56,7 @@ HAS_TLB_RANGE HAS_VA52 HAS_VIRT_HOST_EXTN HAS_WFXT +HAFT HW_DBM KVM_HVHE KVM_PROTECTED_MODE From 62df5870ebf7cec96a51c9b9008daf167e22db14 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Sat, 2 Nov 2024 18:42:34 +0800 Subject: [PATCH 134/168] arm64: Enable ARCH_HAS_NONLEAF_PMD_YOUNG With the support of FEAT_HAFT, the NONLEAF_PMD_YOUNG can be enabled on arm64 since the hardware is capable of updating the AF flag for PMD table descriptor. Since the AF bit of the table descriptor shares the same bit position in block descriptors, we only need to implement arch_has_hw_nonleaf_pmd_young() and select related configs. The related pmd_young test/update operations keeps the same with and already implemented for transparent page support. Currently ARCH_HAS_NONLEAF_PMD_YOUNG is used to improve the efficiency of lru-gen aging. Signed-off-by: Yicong Yang Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20241102104235.62560-5-yangyicong@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/pgtable.h | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b90f92f89d54..d2ca7b6c6f85 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -38,6 +38,7 @@ config ARM64 select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + select ARCH_HAS_NONLEAF_PMD_YOUNG if ARM64_HAFT select ARCH_HAS_PTE_DEVMAP select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_HW_PTE_YOUNG diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index c329ea061dc9..5182feabd943 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1259,7 +1259,7 @@ static inline int __ptep_clear_flush_young(struct vm_area_struct *vma, return young; } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, @@ -1267,7 +1267,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, { return __ptep_test_and_clear_young(vma, address, (pte_t *)pmdp); } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ static inline pte_t __ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) @@ -1502,6 +1502,10 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf, */ #define arch_has_hw_pte_young cpu_has_hw_af +#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG +#define arch_has_hw_nonleaf_pmd_young system_supports_haft +#endif + /* * Experimentally, it's cheap to set the access flag in hardware and we * benefit from prefaulting mappings as 'old' to start with. 
From b349a5a2b6e236b25095c6ff886b3451de5ea041 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Sat, 2 Nov 2024 18:42:35 +0800 Subject: [PATCH 135/168] arm64: pgtable: Warn unexpected pmdp_test_and_clear_young() Young bit operation on PMD table entry is only supported if FEAT_HAFT enabled system wide. Add a warning for notifying the misbehaviour. Signed-off-by: Yicong Yang Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20241102104235.62560-6-yangyicong@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 5182feabd943..160ca503c99f 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1265,6 +1265,8 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { + /* Operation applies to PMD table entry only if FEAT_HAFT is enabled */ + VM_WARN_ON(pmd_table(READ_ONCE(*pmdp)) && !system_supports_haft()); return __ptep_test_and_clear_young(vma, address, (pte_t *)pmdp); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ From 845fd2cbedaf299ee61680a678b279dfeb6fe77c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sun, 27 Oct 2024 19:03:14 +0100 Subject: [PATCH 136/168] perf: Switch back to struct platform_driver::remove() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit 0edb555a65d1 ("platform: Make platform_driver::remove() return void") .remove() is (again) the right callback to implement for platform drivers. Convert all platform drivers below drivers/perf to use .remove(), with the eventual goal to drop struct platform_driver::remove_new(). As .remove() and .remove_new() have the same prototypes, conversion is done by just changing the structure member name in the driver initializer. 
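For illustration only (a hypothetical foo_pmu driver, not taken from any of the files touched below), the conversion looks like this; since commit 0edb555a65d1 both .remove() and .remove_new() take a struct platform_device * and return void, so only the member name in the initializer changes:

	static void foo_pmu_remove(struct platform_device *pdev)
	{
		/* device teardown, unchanged by this conversion */
	}

	static struct platform_driver foo_pmu_driver = {
		.driver = {
			.name = "foo-pmu",
		},
		.probe = foo_pmu_probe,
		/* previously: .remove_new = foo_pmu_remove, */
		.remove = foo_pmu_remove,
	};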
Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20241027180313.410964-2-u.kleine-koenig@baylibre.com Signed-off-by: Will Deacon --- drivers/perf/alibaba_uncore_drw_pmu.c | 2 +- drivers/perf/amlogic/meson_g12_ddr_pmu.c | 2 +- drivers/perf/arm-cci.c | 2 +- drivers/perf/arm-ccn.c | 2 +- drivers/perf/arm-cmn.c | 2 +- drivers/perf/arm_cspmu/arm_cspmu.c | 2 +- drivers/perf/arm_dmc620_pmu.c | 2 +- drivers/perf/arm_dsu_pmu.c | 2 +- drivers/perf/arm_smmuv3_pmu.c | 2 +- drivers/perf/arm_spe_pmu.c | 2 +- drivers/perf/fsl_imx8_ddr_perf.c | 2 +- drivers/perf/fsl_imx9_ddr_perf.c | 2 +- drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c | 2 +- drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c | 2 +- drivers/perf/hisilicon/hisi_uncore_hha_pmu.c | 2 +- drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c | 2 +- drivers/perf/hisilicon/hisi_uncore_pa_pmu.c | 2 +- drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c | 2 +- drivers/perf/marvell_cn10k_ddr_pmu.c | 2 +- drivers/perf/marvell_cn10k_tad_pmu.c | 2 +- drivers/perf/qcom_l2_pmu.c | 2 +- drivers/perf/thunderx2_pmu.c | 2 +- drivers/perf/xgene_pmu.c | 2 +- 23 files changed, 23 insertions(+), 23 deletions(-) diff --git a/drivers/perf/alibaba_uncore_drw_pmu.c b/drivers/perf/alibaba_uncore_drw_pmu.c index c6ff1bc7d336..99a0ef9817e0 100644 --- a/drivers/perf/alibaba_uncore_drw_pmu.c +++ b/drivers/perf/alibaba_uncore_drw_pmu.c @@ -782,7 +782,7 @@ static struct platform_driver ali_drw_pmu_driver = { .acpi_match_table = ali_drw_acpi_match, }, .probe = ali_drw_pmu_probe, - .remove_new = ali_drw_pmu_remove, + .remove = ali_drw_pmu_remove, }; static int __init ali_drw_pmu_init(void) diff --git a/drivers/perf/amlogic/meson_g12_ddr_pmu.c b/drivers/perf/amlogic/meson_g12_ddr_pmu.c index 99cc791892bc..f33e9a456e85 100644 --- a/drivers/perf/amlogic/meson_g12_ddr_pmu.c +++ b/drivers/perf/amlogic/meson_g12_ddr_pmu.c @@ -379,7 +379,7 @@ MODULE_DEVICE_TABLE(of, meson_ddr_pmu_dt_match); static struct platform_driver g12_ddr_pmu_driver = { .probe = g12_ddr_pmu_probe, - .remove_new = g12_ddr_pmu_remove, + .remove = g12_ddr_pmu_remove, .driver = { .name = "meson-g12-ddr-pmu", diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c index c76bac668dea..1cc3214d6b6d 100644 --- a/drivers/perf/arm-cci.c +++ b/drivers/perf/arm-cci.c @@ -1705,7 +1705,7 @@ static struct platform_driver cci_pmu_driver = { .suppress_bind_attrs = true, }, .probe = cci_pmu_probe, - .remove_new = cci_pmu_remove, + .remove = cci_pmu_remove, }; module_platform_driver(cci_pmu_driver); diff --git a/drivers/perf/arm-ccn.c b/drivers/perf/arm-ccn.c index 5c66b9278862..d5fcea3d4328 100644 --- a/drivers/perf/arm-ccn.c +++ b/drivers/perf/arm-ccn.c @@ -1529,7 +1529,7 @@ static struct platform_driver arm_ccn_driver = { .suppress_bind_attrs = true, }, .probe = arm_ccn_probe, - .remove_new = arm_ccn_remove, + .remove = arm_ccn_remove, }; static int __init arm_ccn_init(void) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 397a46410f7c..49bd811c6fd6 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -2662,7 +2662,7 @@ static struct platform_driver arm_cmn_driver = { .acpi_match_table = ACPI_PTR(arm_cmn_acpi_match), }, .probe = arm_cmn_probe, - .remove_new = arm_cmn_remove, + .remove = arm_cmn_remove, }; static int __init arm_cmn_init(void) diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c index 2158a5975c90..81e8b97e9353 100644 --- a/drivers/perf/arm_cspmu/arm_cspmu.c +++ b/drivers/perf/arm_cspmu/arm_cspmu.c @@ -1282,7 +1282,7 @@ static struct 
platform_driver arm_cspmu_driver = { .suppress_bind_attrs = true, }, .probe = arm_cspmu_device_probe, - .remove_new = arm_cspmu_device_remove, + .remove = arm_cspmu_device_remove, .id_table = arm_cspmu_id, }; diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c index 7e5f1d4fca0f..619cf937602f 100644 --- a/drivers/perf/arm_dmc620_pmu.c +++ b/drivers/perf/arm_dmc620_pmu.c @@ -750,7 +750,7 @@ static struct platform_driver dmc620_pmu_driver = { .suppress_bind_attrs = true, }, .probe = dmc620_pmu_device_probe, - .remove_new = dmc620_pmu_device_remove, + .remove = dmc620_pmu_device_remove, }; static int __init dmc620_pmu_init(void) diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c index f2bd25a3470a..cb4fb59fe04b 100644 --- a/drivers/perf/arm_dsu_pmu.c +++ b/drivers/perf/arm_dsu_pmu.c @@ -787,7 +787,7 @@ static struct platform_driver dsu_pmu_driver = { .suppress_bind_attrs = true, }, .probe = dsu_pmu_device_probe, - .remove_new = dsu_pmu_device_remove, + .remove = dsu_pmu_device_remove, }; static int dsu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node) diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c index d5fa92ba8373..b1510f660c7a 100644 --- a/drivers/perf/arm_smmuv3_pmu.c +++ b/drivers/perf/arm_smmuv3_pmu.c @@ -996,7 +996,7 @@ static struct platform_driver smmu_pmu_driver = { .suppress_bind_attrs = true, }, .probe = smmu_pmu_probe, - .remove_new = smmu_pmu_remove, + .remove = smmu_pmu_remove, .shutdown = smmu_pmu_shutdown, }; diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 3569050f9cf3..fd5b78732603 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -1280,7 +1280,7 @@ static struct platform_driver arm_spe_pmu_driver = { .suppress_bind_attrs = true, }, .probe = arm_spe_pmu_device_probe, - .remove_new = arm_spe_pmu_device_remove, + .remove = arm_spe_pmu_device_remove, }; static int __init arm_spe_pmu_init(void) diff --git a/drivers/perf/fsl_imx8_ddr_perf.c b/drivers/perf/fsl_imx8_ddr_perf.c index 746b92330ca7..b989ffa95d69 100644 --- a/drivers/perf/fsl_imx8_ddr_perf.c +++ b/drivers/perf/fsl_imx8_ddr_perf.c @@ -846,7 +846,7 @@ static struct platform_driver imx_ddr_pmu_driver = { .suppress_bind_attrs = true, }, .probe = ddr_perf_probe, - .remove_new = ddr_perf_remove, + .remove = ddr_perf_remove, }; module_platform_driver(imx_ddr_pmu_driver); diff --git a/drivers/perf/fsl_imx9_ddr_perf.c b/drivers/perf/fsl_imx9_ddr_perf.c index fe1a51f64751..3c856d9a4e97 100644 --- a/drivers/perf/fsl_imx9_ddr_perf.c +++ b/drivers/perf/fsl_imx9_ddr_perf.c @@ -853,7 +853,7 @@ static struct platform_driver imx_ddr_pmu_driver = { .suppress_bind_attrs = true, }, .probe = ddr_perf_probe, - .remove_new = ddr_perf_remove, + .remove = ddr_perf_remove, }; module_platform_driver(imx_ddr_pmu_driver); diff --git a/drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c b/drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c index 0e923f94fa5b..3f3fb1de11f5 100644 --- a/drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c @@ -358,7 +358,7 @@ static struct platform_driver hisi_cpa_pmu_driver = { .suppress_bind_attrs = true, }, .probe = hisi_cpa_pmu_probe, - .remove_new = hisi_cpa_pmu_remove, + .remove = hisi_cpa_pmu_remove, }; static int __init hisi_cpa_pmu_module_init(void) diff --git a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c index b804e3738113..a6ebf2ec99d3 100644 --- a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c +++ 
b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c @@ -547,7 +547,7 @@ static struct platform_driver hisi_ddrc_pmu_driver = { .suppress_bind_attrs = true, }, .probe = hisi_ddrc_pmu_probe, - .remove_new = hisi_ddrc_pmu_remove, + .remove = hisi_ddrc_pmu_remove, }; static int __init hisi_ddrc_pmu_module_init(void) diff --git a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c index 21e69b1cdd4d..32624872596f 100644 --- a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c @@ -550,7 +550,7 @@ static struct platform_driver hisi_hha_pmu_driver = { .suppress_bind_attrs = true, }, .probe = hisi_hha_pmu_probe, - .remove_new = hisi_hha_pmu_remove, + .remove = hisi_hha_pmu_remove, }; static int __init hisi_hha_pmu_module_init(void) diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c index 51ba76871097..c235b46ce873 100644 --- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c @@ -584,7 +584,7 @@ static struct platform_driver hisi_l3c_pmu_driver = { .suppress_bind_attrs = true, }, .probe = hisi_l3c_pmu_probe, - .remove_new = hisi_l3c_pmu_remove, + .remove = hisi_l3c_pmu_remove, }; static int __init hisi_l3c_pmu_module_init(void) diff --git a/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c index 3cdb35c741f9..c0f5d7c73e06 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c @@ -538,7 +538,7 @@ static struct platform_driver hisi_pa_pmu_driver = { .suppress_bind_attrs = true, }, .probe = hisi_pa_pmu_probe, - .remove_new = hisi_pa_pmu_remove, + .remove = hisi_pa_pmu_remove, }; static int __init hisi_pa_pmu_module_init(void) diff --git a/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c index 765bbd61db26..c5f4764ee888 100644 --- a/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c @@ -476,7 +476,7 @@ static struct platform_driver hisi_sllc_pmu_driver = { .suppress_bind_attrs = true, }, .probe = hisi_sllc_pmu_probe, - .remove_new = hisi_sllc_pmu_remove, + .remove = hisi_sllc_pmu_remove, }; static int __init hisi_sllc_pmu_module_init(void) diff --git a/drivers/perf/marvell_cn10k_ddr_pmu.c b/drivers/perf/marvell_cn10k_ddr_pmu.c index 94f1ebcd2a27..8860d9f687ae 100644 --- a/drivers/perf/marvell_cn10k_ddr_pmu.c +++ b/drivers/perf/marvell_cn10k_ddr_pmu.c @@ -732,7 +732,7 @@ static struct platform_driver cn10k_ddr_pmu_driver = { .suppress_bind_attrs = true, }, .probe = cn10k_ddr_perf_probe, - .remove_new = cn10k_ddr_perf_remove, + .remove = cn10k_ddr_perf_remove, }; static int __init cn10k_ddr_pmu_init(void) diff --git a/drivers/perf/marvell_cn10k_tad_pmu.c b/drivers/perf/marvell_cn10k_tad_pmu.c index 9e635f355470..cda55ee35eee 100644 --- a/drivers/perf/marvell_cn10k_tad_pmu.c +++ b/drivers/perf/marvell_cn10k_tad_pmu.c @@ -383,7 +383,7 @@ static struct platform_driver tad_pmu_driver = { .suppress_bind_attrs = true, }, .probe = tad_pmu_probe, - .remove_new = tad_pmu_remove, + .remove = tad_pmu_remove, }; static int tad_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) diff --git a/drivers/perf/qcom_l2_pmu.c b/drivers/perf/qcom_l2_pmu.c index 980e3051edd7..ea8c85729937 100644 --- a/drivers/perf/qcom_l2_pmu.c +++ b/drivers/perf/qcom_l2_pmu.c @@ -981,7 +981,7 @@ static struct platform_driver l2_cache_pmu_driver = { .suppress_bind_attrs = true, 
}, .probe = l2_cache_pmu_probe, - .remove_new = l2_cache_pmu_remove, + .remove = l2_cache_pmu_remove, }; static int __init register_l2_cache_pmu_driver(void) diff --git a/drivers/perf/thunderx2_pmu.c b/drivers/perf/thunderx2_pmu.c index faf763d2c95c..cadd60221b8f 100644 --- a/drivers/perf/thunderx2_pmu.c +++ b/drivers/perf/thunderx2_pmu.c @@ -1010,7 +1010,7 @@ static struct platform_driver tx2_uncore_driver = { .suppress_bind_attrs = true, }, .probe = tx2_uncore_probe, - .remove_new = tx2_uncore_remove, + .remove = tx2_uncore_remove, }; static int __init tx2_uncore_driver_init(void) diff --git a/drivers/perf/xgene_pmu.c b/drivers/perf/xgene_pmu.c index c01466ae1e3d..33b5497bdc06 100644 --- a/drivers/perf/xgene_pmu.c +++ b/drivers/perf/xgene_pmu.c @@ -1943,7 +1943,7 @@ static void xgene_pmu_remove(struct platform_device *pdev) static struct platform_driver xgene_pmu_driver = { .probe = xgene_pmu_probe, - .remove_new = xgene_pmu_remove, + .remove = xgene_pmu_remove, .driver = { .name = "xgene-pmu", .of_match_table = xgene_pmu_of_match, From 340fd66c856651d8c1d29f392dd26ad674d2db0e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 7 Nov 2024 01:18:42 +0900 Subject: [PATCH 137/168] arm64: fix .data.rel.ro size assertion when CONFIG_LTO_CLANG Commit be2881824ae9 ("arm64/build: Assert for unwanted sections") introduced an assertion to ensure that the .data.rel.ro section does not exist. However, this check does not work when CONFIG_LTO_CLANG is enabled, because .data.rel.ro matches the .data.[0-9a-zA-Z_]* pattern in the DATA_MAIN macro. Move the ASSERT() above the RW_DATA() line. Fixes: be2881824ae9 ("arm64/build: Assert for unwanted sections") Signed-off-by: Masahiro Yamada Acked-by: Will Deacon Link: https://lore.kernel.org/r/20241106161843.189927-1-masahiroy@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/vmlinux.lds.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 58d89d997d05..f84c71f04d9e 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -287,6 +287,9 @@ SECTIONS __initdata_end = .; __init_end = .; + .data.rel.ro : { *(.data.rel.ro) } + ASSERT(SIZEOF(.data.rel.ro) == 0, "Unexpected RELRO detected!") + _data = .; _sdata = .; RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) @@ -343,9 +346,6 @@ SECTIONS *(.plt) *(.plt.*) *(.iplt) *(.igot .igot.plt) } ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!") - - .data.rel.ro : { *(.data.rel.ro) } - ASSERT(SIZEOF(.data.rel.ro) == 0, "Unexpected RELRO detected!") } #include "image-vars.h" From a3590d71a1acef850864f19bff2f37f56b2d4f00 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 30 Oct 2024 00:02:02 +0000 Subject: [PATCH 138/168] kselftest/arm64: Increase frequency of signal delivery in fp-stress Currently we only deliver signals to the processes being tested about once a second, meaning that the signal code paths are subject to relatively little stress. Increase this frequency substantially to 25ms intervals, along with some minor refactoring to make this more readily tuneable and maintain the 1s logging interval. 
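As a quick editorial cross-check of those numbers (derived from the constants added in the diff below, not text from the commit itself), the logging cadence and the default run length are unchanged even though signals are now delivered every 25ms:

	#define SIGNAL_INTERVAL_MS 25
	#define LOG_INTERVALS (1000 / SIGNAL_INTERVAL_MS)	/* 40 poll intervals, so one log line per second */

	int timeout = 10 * (1000 / SIGNAL_INTERVAL_MS);		/* 400 poll intervals, i.e. the previous 10s default */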
This interval was chosen based on some experimentation with emulated platforms to avoid causing so much extra load that the test starts to run into the 45s limit for selftests or generally completely disconnect the timeout numbers from the elapsed wall clock time. We could increase this if we moved the signal generation out of the main supervisor thread, though we should also consider that the percentage of time that we spend interacting with the floating point state is also a consideration. Suggested-by: Mark Rutland Signed-off-by: Mark Brown Acked-by: Will Deacon Link: https://lore.kernel.org/r/20241030-arm64-fp-stress-interval-v2-1-bd3cef48c22c@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/fp-stress.c | 26 +++++++++++--------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c index 13958e645afc..b81bc0842f17 100644 --- a/tools/testing/selftests/arm64/fp/fp-stress.c +++ b/tools/testing/selftests/arm64/fp/fp-stress.c @@ -28,6 +28,9 @@ #define MAX_VLS 16 +#define SIGNAL_INTERVAL_MS 25 +#define LOG_INTERVALS (1000 / SIGNAL_INTERVAL_MS) + struct child_data { char *name, *output; pid_t pid; @@ -448,7 +451,7 @@ static const struct option options[] = { int main(int argc, char **argv) { int ret; - int timeout = 10; + int timeout = 10 * (1000 / SIGNAL_INTERVAL_MS); int cpus, i, j, c; int sve_vl_count, sme_vl_count; bool all_children_started = false; @@ -504,7 +507,7 @@ int main(int argc, char **argv) have_sme2 ? "present" : "absent"); if (timeout > 0) - ksft_print_msg("Will run for %ds\n", timeout); + ksft_print_msg("Will run for %d\n", timeout); else ksft_print_msg("Will run until terminated\n"); @@ -577,14 +580,14 @@ int main(int argc, char **argv) break; /* - * Timeout is counted in seconds with no output, the - * tests print during startup then are silent when - * running so this should ensure they all ran enough - * to install the signal handler, this is especially - * useful in emulation where we will both be slow and - * likely to have a large set of VLs. + * Timeout is counted in poll intervals with no + * output, the tests print during startup then are + * silent when running so this should ensure they all + * ran enough to install the signal handler, this is + * especially useful in emulation where we will both + * be slow and likely to have a large set of VLs. */ - ret = epoll_wait(epoll_fd, evs, tests, 1000); + ret = epoll_wait(epoll_fd, evs, tests, SIGNAL_INTERVAL_MS); if (ret < 0) { if (errno == EINTR) continue; @@ -624,8 +627,9 @@ int main(int argc, char **argv) all_children_started = true; } - ksft_print_msg("Sending signals, timeout remaining: %d\n", - timeout); + if ((timeout % LOG_INTERVALS) == 0) + ksft_print_msg("Sending signals, timeout remaining: %d\n", + timeout); for (i = 0; i < num_children; i++) child_tickle(&children[i]); From 161e9925053cafa83a1eb265a001b6917dfafa29 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 30 Oct 2024 00:02:03 +0000 Subject: [PATCH 139/168] kselftest/arm64: Poll less often while waiting for fp-stress children While fp-stress is waiting for children to start it doesn't send any signals to them so there is no need for it to have as short an epoll() timeout as it does when the children are all running. We do still want to have some timeout so that we can log diagnostics about missing children but this can be relatively large.
On emulated platforms the overhead of running the supervisor process is quite high, especially during the process of execing the test binaries. Implement a longer epoll() timeout during the setup phase, using a 5s timeout while waiting for children and switching to the signal raise interval when all the children are started and we start sending signals. Signed-off-by: Mark Brown Acked-by: Will Deacon Link: https://lore.kernel.org/r/20241030-arm64-fp-stress-interval-v2-2-bd3cef48c22c@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/fp-stress.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c index b81bc0842f17..ad867ff9687a 100644 --- a/tools/testing/selftests/arm64/fp/fp-stress.c +++ b/tools/testing/selftests/arm64/fp/fp-stress.c @@ -452,6 +452,7 @@ int main(int argc, char **argv) { int ret; int timeout = 10 * (1000 / SIGNAL_INTERVAL_MS); + int poll_interval = 5000; int cpus, i, j, c; int sve_vl_count, sme_vl_count; bool all_children_started = false; @@ -587,7 +588,7 @@ int main(int argc, char **argv) * especially useful in emulation where we will both * be slow and likely to have a large set of VLs. */ - ret = epoll_wait(epoll_fd, evs, tests, SIGNAL_INTERVAL_MS); + ret = epoll_wait(epoll_fd, evs, tests, poll_interval); if (ret < 0) { if (errno == EINTR) continue; @@ -625,6 +626,7 @@ int main(int argc, char **argv) } all_children_started = true; + poll_interval = SIGNAL_INTERVAL_MS; } if ((timeout % LOG_INTERVALS) == 0) From 94de486e4215b83de8a5e908b4b8f7d3979c3b0e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 7 Nov 2024 01:39:20 +0000 Subject: [PATCH 140/168] kselftest/arm64: Correct misleading comments on fp-stress irritators The comments in the handlers for the irritator signal in the test threads for fp-stress suggest that the irritator will corrupt the register state observed by the main thread but this is not the case, instead the FPSIMD and SVE irritators (which are the only ones that are implemented) modify the current register state which is expected to be overwritten on return from the handler by the saved register state. Update the comment to reflect what the handler is actually doing. Acked-by: Mark Rutland Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241107-arm64-fp-stress-irritator-v2-1-c4b9622e36ee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/fpsimd-test.S | 3 +-- tools/testing/selftests/arm64/fp/sve-test.S | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/fpsimd-test.S b/tools/testing/selftests/arm64/fp/fpsimd-test.S index 8b960d01ed2e..bdfb7cf2e4ec 100644 --- a/tools/testing/selftests/arm64/fp/fpsimd-test.S +++ b/tools/testing/selftests/arm64/fp/fpsimd-test.S @@ -134,8 +134,7 @@ function check_vreg b memcmp endfunction -// Any SVE register modified here can cause corruption in the main -// thread -- but *only* the registers modified here. 
+// Modify live register state, the signal return will undo our changes function irritator_handler // Increment the irritation signal count (x23): ldr x0, [x2, #ucontext_regs + 8 * 23] diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S index fff60e2a25ad..e3c0d585684d 100644 --- a/tools/testing/selftests/arm64/fp/sve-test.S +++ b/tools/testing/selftests/arm64/fp/sve-test.S @@ -291,8 +291,7 @@ function check_ffr #endif endfunction -// Any SVE register modified here can cause corruption in the main -// thread -- but *only* the registers modified here. +// Modify live register state, the signal return will undo our changes function irritator_handler // Increment the irritation signal count (x23): ldr x0, [x2, #ucontext_regs + 8 * 23] From ffca567fef9c4661c792ca48a2cdd038b2df4887 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 7 Nov 2024 01:39:21 +0000 Subject: [PATCH 141/168] kselftest/arm64: Remove unused ADRs from irritator handlers The irritator handlers for the fp-stress test programs all use ADR to load an address into x0 which is then not referenced. Remove these ADRs as they just cause confusion. Acked-by: Mark Rutland Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241107-arm64-fp-stress-irritator-v2-2-c4b9622e36ee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/fpsimd-test.S | 1 - tools/testing/selftests/arm64/fp/sve-test.S | 1 - tools/testing/selftests/arm64/fp/za-test.S | 1 - tools/testing/selftests/arm64/fp/zt-test.S | 1 - 4 files changed, 4 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/fpsimd-test.S b/tools/testing/selftests/arm64/fp/fpsimd-test.S index bdfb7cf2e4ec..9977ffdd758a 100644 --- a/tools/testing/selftests/arm64/fp/fpsimd-test.S +++ b/tools/testing/selftests/arm64/fp/fpsimd-test.S @@ -142,7 +142,6 @@ function irritator_handler str x0, [x2, #ucontext_regs + 8 * 23] // Corrupt some random V-regs - adr x0, .text + (irritator_handler - .text) / 16 * 16 movi v0.8b, #7 movi v9.16b, #9 movi v31.8b, #31 diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S index e3c0d585684d..f1fb9745c681 100644 --- a/tools/testing/selftests/arm64/fp/sve-test.S +++ b/tools/testing/selftests/arm64/fp/sve-test.S @@ -299,7 +299,6 @@ function irritator_handler str x0, [x2, #ucontext_regs + 8 * 23] // Corrupt some random Z-regs - adr x0, .text + (irritator_handler - .text) / 16 * 16 movi v0.8b, #1 movi v9.16b, #2 movi v31.8b, #3 diff --git a/tools/testing/selftests/arm64/fp/za-test.S b/tools/testing/selftests/arm64/fp/za-test.S index 095b45531640..1ee0ec36766d 100644 --- a/tools/testing/selftests/arm64/fp/za-test.S +++ b/tools/testing/selftests/arm64/fp/za-test.S @@ -158,7 +158,6 @@ function irritator_handler // Corrupt some random ZA data #if 0 - adr x0, .text + (irritator_handler - .text) / 16 * 16 movi v0.8b, #1 movi v9.16b, #2 movi v31.8b, #3 diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S index b5c81e81a379..ade9c98abcda 100644 --- a/tools/testing/selftests/arm64/fp/zt-test.S +++ b/tools/testing/selftests/arm64/fp/zt-test.S @@ -127,7 +127,6 @@ function irritator_handler // Corrupt some random ZT data #if 0 - adr x0, .text + (irritator_handler - .text) / 16 * 16 movi v0.8b, #1 movi v9.16b, #2 movi v31.8b, #3 From d65f27d240bb897ac5a4a3fb7557724b3717e022 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 7 Nov 2024 01:39:23 +0000 Subject: [PATCH 142/168] kselftest/arm64: Implement 
irritators for ZA and ZT Currently we don't use the irritator signal in our floating point stress tests so when we added ZA and ZT stress tests we didn't actually bother implementing any actual action in the handlers, we just counted the signal deliveries. In preparation for using the irritators let's implement them, just trivially SMSTOP and SMSTART to reset all bits in the register to 0. Acked-by: Mark Rutland Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241107-arm64-fp-stress-irritator-v2-4-c4b9622e36ee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/za-test.S | 12 ++++-------- tools/testing/selftests/arm64/fp/zt-test.S | 12 ++++-------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/za-test.S b/tools/testing/selftests/arm64/fp/za-test.S index 1ee0ec36766d..f902e6ef9077 100644 --- a/tools/testing/selftests/arm64/fp/za-test.S +++ b/tools/testing/selftests/arm64/fp/za-test.S @@ -148,20 +148,16 @@ function check_za b memcmp endfunction -// Any SME register modified here can cause corruption in the main -// thread -- but *only* the locations modified here. +// Modify the live SME register state, signal return will undo our changes function irritator_handler // Increment the irritation signal count (x23): ldr x0, [x2, #ucontext_regs + 8 * 23] add x0, x0, #1 str x0, [x2, #ucontext_regs + 8 * 23] - // Corrupt some random ZA data -#if 0 - movi v0.8b, #1 - movi v9.16b, #2 - movi v31.8b, #3 -#endif + // This will reset ZA to all bits 0 + smstop + smstart ret endfunction diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S index ade9c98abcda..c96cb7c2ad4b 100644 --- a/tools/testing/selftests/arm64/fp/zt-test.S +++ b/tools/testing/selftests/arm64/fp/zt-test.S @@ -117,20 +117,16 @@ function check_zt b memcmp endfunction -// Any SME register modified here can cause corruption in the main -// thread -- but *only* the locations modified here. +// Modify the live SME register state, signal return will undo our changes function irritator_handler // Increment the irritation signal count (x23): ldr x0, [x2, #ucontext_regs + 8 * 23] add x0, x0, #1 str x0, [x2, #ucontext_regs + 8 * 23] - // Corrupt some random ZT data -#if 0 - movi v0.8b, #1 - movi v9.16b, #2 - movi v31.8b, #3 -#endif + // This will reset ZT to all bits 0 + smstop + smstart ret endfunction From 7368debf275aa2419365ce47da00922ca51c6094 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 7 Nov 2024 01:39:24 +0000 Subject: [PATCH 143/168] kselftest/arm64: Provide a SIGUSR1 handler in the kernel mode FP stress test The other stress test programs provide a SIGUSR1 handler which modifies the live register state in order to validate that signal context is being restored during signal return. While we can't usefully do this when testing kernel mode FP usage provide a handler for SIGUSR1 which just counts the number of signals like we do for SIGUSR2, allowing fp-stress to treat all the test programs uniformly. 
Acked-by: Mark Rutland Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241107-arm64-fp-stress-irritator-v2-5-c4b9622e36ee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/kernel-test.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/arm64/fp/kernel-test.c b/tools/testing/selftests/arm64/fp/kernel-test.c index e8da3b4cbd23..859345379044 100644 --- a/tools/testing/selftests/arm64/fp/kernel-test.c +++ b/tools/testing/selftests/arm64/fp/kernel-test.c @@ -267,6 +267,10 @@ int main(void) strerror(errno), errno); sa.sa_sigaction = handle_kick_signal; + ret = sigaction(SIGUSR1, &sa, NULL); + if (ret < 0) + printf("Failed to install SIGUSR1 handler: %s (%d)\n", + strerror(errno), errno); ret = sigaction(SIGUSR2, &sa, NULL); if (ret < 0) printf("Failed to install SIGUSR2 handler: %s (%d)\n", From ead1c35ce3b3c766443a82b56ee343cfe7ee8305 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 7 Nov 2024 01:39:25 +0000 Subject: [PATCH 144/168] kselftest/arm64: Test signal handler state modification in fp-stress Currently in fp-stress we test signal delivery to the test threads by sending SIGUSR2 which simply counts how many signals are delivered. The test programs now also all have a SIGUSR1 handler which for the threads doing userspace testing additionally modifies the floating point register state in the signal handler, verifying that when we return the saved register state is restored from the signal context as expected. Switch over to triggering that to validate that we are restoring as expected. Acked-by: Mark Rutland Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241107-arm64-fp-stress-irritator-v2-6-c4b9622e36ee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/fp-stress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c index ad867ff9687a..74e23208b94c 100644 --- a/tools/testing/selftests/arm64/fp/fp-stress.c +++ b/tools/testing/selftests/arm64/fp/fp-stress.c @@ -223,7 +223,7 @@ static void child_output(struct child_data *child, uint32_t events, static void child_tickle(struct child_data *child) { if (child->output_seen && !child->exited) - kill(child->pid, SIGUSR2); + kill(child->pid, SIGUSR1); } static void child_stop(struct child_data *child) From db64dfffcad2992d6bfc680822bdf715335c43f1 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Thu, 7 Nov 2024 13:16:40 +0000 Subject: [PATCH 145/168] selftests/mm: Define PKEY_UNRESTRICTED for pkey_sighandler_tests Commit 6e182dc9f268 ("selftests/mm: Use generic pkey register manipulation") makes use of PKEY_UNRESTRICTED in pkey_sighandler_tests. The macro has been proposed for addition to uapi headers [1], but the patch hasn't landed yet. Define PKEY_UNRESTRICTED in pkey-helpers.h for the time being to fix the build. 
[1] https://lore.kernel.org/all/20241028090715.509527-2-yury.khrustalev@arm.com/ Fixes: 6e182dc9f268 ("selftests/mm: Use generic pkey register manipulation") Reported-by: Aishwarya TCV Signed-off-by: Kevin Brodsky Link: https://lore.kernel.org/r/20241107131640.650703-1-kevin.brodsky@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/mm/pkey-helpers.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h index 9ab6a3ee153b..f7cfe163b0ff 100644 --- a/tools/testing/selftests/mm/pkey-helpers.h +++ b/tools/testing/selftests/mm/pkey-helpers.h @@ -112,6 +112,13 @@ void record_pkey_malloc(void *ptr, long size, int prot); #define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) #endif +/* + * FIXME: Remove once the generic PKEY_UNRESTRICTED definition is merged. + */ +#ifndef PKEY_UNRESTRICTED +#define PKEY_UNRESTRICTED 0x0 +#endif + #ifndef set_pkey_bits static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) { From 1a9de2f6fda69d5f105dd8af776856a66abdaa64 Mon Sep 17 00:00:00 2001 From: Aleksandr Mishin Date: Tue, 27 Aug 2024 13:12:39 +0300 Subject: [PATCH 146/168] acpi/arm64: Adjust error handling procedure in gtdt_parse_timer_block() In case of an error in gtdt_parse_timer_block(), an invalid 'gtdt_frame' will be used in the 'do {} while (i-- >= 0 && gtdt_frame--);' statement block, because the do{} block will be executed even if 'i == 0'. Adjust the error handling procedure by replacing 'i-- >= 0' with 'i-- > 0'. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: a712c3ed9b8a ("acpi/arm64: Add memory-mapped timer support in GTDT driver") Signed-off-by: Aleksandr Mishin Acked-by: Hanjun Guo Acked-by: Sudeep Holla Acked-by: Aleksandr Mishin Link: https://lore.kernel.org/r/20240827101239.22020-1-amishin@t-argos.ru Signed-off-by: Catalin Marinas --- drivers/acpi/arm64/gtdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c index d7c4e1b9915b..81dc0582743d 100644 --- a/drivers/acpi/arm64/gtdt.c +++ b/drivers/acpi/arm64/gtdt.c @@ -294,7 +294,7 @@ error: if (frame->virt_irq > 0) acpi_unregister_gsi(gtdt_frame->virtual_timer_interrupt); frame->virt_irq = 0; - } while (i-- >= 0 && gtdt_frame--); + } while (i-- > 0 && gtdt_frame--); return -EINVAL; } From bdf94836c22abc923af5ae83709c96ff4c3c77c9 Mon Sep 17 00:00:00 2001 From: Liao Chang Date: Thu, 19 Sep 2024 12:17:19 +0000 Subject: [PATCH 147/168] arm64: uprobes: Optimize cache flushes for xol slot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Profiling of the single-thread selftests bench reveals a bottleneck in caches_clean_inval_pou() on ARM64. On my local testing machine, this function takes approximately 34% of CPU cycles for trig-uprobe-nop and trig-uprobe-push. This patch adds a check to avoid an unnecessary cache flush when writing an instruction to the xol slot. If the instruction is the same as the existing instruction in the slot, there is no need to synchronize the D/I caches. Since xol slot allocation and updates occur on the hot path of uprobe handling, this saves measurable work: the upstream kernel running on Kunpeng916 (Hi1616), 4 NUMA nodes, 64 cores @ 2.4GHz shows an obvious gain from this optimization for the nop and push testcases.
Before (next-20240918) ---------------------- uprobe-nop ( 1 cpus): 0.418 ± 0.001M/s ( 0.418M/s/cpu) uprobe-push ( 1 cpus): 0.411 ± 0.005M/s ( 0.411M/s/cpu) uprobe-ret ( 1 cpus): 2.052 ± 0.002M/s ( 2.052M/s/cpu) uretprobe-nop ( 1 cpus): 0.350 ± 0.000M/s ( 0.350M/s/cpu) uretprobe-push ( 1 cpus): 0.353 ± 0.000M/s ( 0.353M/s/cpu) uretprobe-ret ( 1 cpus): 1.074 ± 0.001M/s ( 1.074M/s/cpu) After ----- uprobe-nop ( 1 cpus): 0.926 ± 0.000M/s ( 0.926M/s/cpu) uprobe-push ( 1 cpus): 0.910 ± 0.001M/s ( 0.910M/s/cpu) uprobe-ret ( 1 cpus): 2.056 ± 0.001M/s ( 2.056M/s/cpu) uretprobe-nop ( 1 cpus): 0.653 ± 0.001M/s ( 0.653M/s/cpu) uretprobe-push ( 1 cpus): 0.645 ± 0.000M/s ( 0.645M/s/cpu) uretprobe-ret ( 1 cpus): 1.093 ± 0.001M/s ( 1.093M/s/cpu) Signed-off-by: Liao Chang Link: https://lore.kernel.org/r/20240919121719.2148361-1-liaochang1@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/probes/uprobes.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index d49aef2657cd..ba61651db4b5 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -17,12 +17,20 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *xol_page_kaddr = kmap_atomic(page); void *dst = xol_page_kaddr + (vaddr & ~PAGE_MASK); + /* + * Initial cache maintenance of the xol page done via set_pte_at(). + * Subsequent CMOs only needed if the xol slot changes. + */ + if (!memcmp(dst, src, len)) + goto done; + /* Initialize the slot */ memcpy(dst, src, len); /* flush caches (dcache/icache) */ sync_icache_aliases((unsigned long)dst, (unsigned long)dst + len); +done: kunmap_atomic(xol_page_kaddr); } From ccf54058f5328e6da5ecc61c961bf68bbb7d865b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 6 Nov 2024 19:55:15 +0100 Subject: [PATCH 148/168] arm64/scs: Fix handling of DWARF augmentation data in CIE/FDE frames The dynamic SCS patching code pretends to parse the DWARF augmentation data in the CIE (header) frame, and handle accordingly when processing the individual FDE frames based on this CIE frame. However, the boolean variable is defined inside the loop, and so the parsed value is ignored. The same applies to the code alignment field, which is also read from the header but then discarded. This was never spotted before because Clang is the only compiler that supports dynamic SCS patching (which is essentially an Android feature), and the unwind tables it produces are highly uniform, and match the de facto defaults. So instead of testing for the 'z' flag in the augmentation data field, require a fixed augmentation data string of 'zR', and simplify the rest of the code accordingly. Also introduce some error codes to specify why the patching failed, and log it to the kernel console on failure when this happens when loading a module. (Doing so for vmlinux is infeasible, as the patching is done extremely early in the boot.) 
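For reference, the single-byte assumption in the parser works because ULEB128 stores seven payload bits per byte and sets bit 7 on every byte except the last, so any value below 128 occupies exactly one byte with bit 7 clear; that is the only case the patched code accepts. A minimal sketch of a general decoder, for comparison only (the helper name is illustrative and not part of the kernel sources):

static unsigned long uleb128_decode(const unsigned char **pp)
{
	const unsigned char *p = *pp;
	unsigned long val = 0;
	int shift = 0;

	do {
		/* low seven bits are payload, bit 7 flags a following byte */
		val |= (unsigned long)(*p & 0x7f) << shift;
		shift += 7;
	} while (*p++ & 0x80);

	*pp = p;	/* advance past the encoded value */
	return val;
}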
Signed-off-by: Ard Biesheuvel Reviewed-by: Sami Tolvanen Tested-by: Sami Tolvanen Link: https://lore.kernel.org/r/20241106185513.3096442-6-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/scs.h | 7 ++++ arch/arm64/kernel/module.c | 10 ++++- arch/arm64/kernel/pi/patch-scs.c | 63 ++++++++++++++++++-------------- 3 files changed, 50 insertions(+), 30 deletions(-) diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h index 2e010ea76be2..934e9217cd74 100644 --- a/arch/arm64/include/asm/scs.h +++ b/arch/arm64/include/asm/scs.h @@ -46,6 +46,13 @@ static inline void dynamic_scs_init(void) static inline void dynamic_scs_init(void) {} #endif +enum { + EDYNSCS_INVALID_CIE_HEADER = 1, + EDYNSCS_INVALID_CIE_SDATA_SIZE = 2, + EDYNSCS_INVALID_FDE_AUGM_DATA_SIZE = 3, + EDYNSCS_INVALID_CFA_OPCODE = 4, +}; + int __pi_scs_patch(const u8 eh_frame[], int size); asmlinkage void __pi_scs_patch_vmlinux(void); diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 36b25af56324..06bb680bfe97 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -462,14 +462,20 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { const Elf_Shdr *s; + int ret; + s = find_section(hdr, sechdrs, ".altinstructions"); if (s) apply_alternatives_module((void *)s->sh_addr, s->sh_size); if (scs_is_dynamic()) { s = find_section(hdr, sechdrs, ".init.eh_frame"); - if (s) - __pi_scs_patch((void *)s->sh_addr, s->sh_size); + if (s) { + ret = __pi_scs_patch((void *)s->sh_addr, s->sh_size); + if (ret) + pr_err("module %s: error occurred during dynamic SCS patching (%d)\n", + me->name, ret); + } } return module_init_ftrace_plt(hdr, sechdrs, me); diff --git a/arch/arm64/kernel/pi/patch-scs.c b/arch/arm64/kernel/pi/patch-scs.c index 49d8b40e61bc..cec8f0a52bbc 100644 --- a/arch/arm64/kernel/pi/patch-scs.c +++ b/arch/arm64/kernel/pi/patch-scs.c @@ -120,7 +120,11 @@ struct eh_frame { union { struct { // CIE u8 version; - u8 augmentation_string[]; + u8 augmentation_string[3]; + u8 code_alignment_factor; + u8 data_alignment_factor; + u8 return_address_register; + u8 augmentation_data_size; }; struct { // FDE @@ -132,25 +136,21 @@ struct eh_frame { }; static int scs_handle_fde_frame(const struct eh_frame *frame, - bool fde_has_augmentation_data, int code_alignment_factor, bool dry_run) { int size = frame->size - offsetof(struct eh_frame, opcodes) + 4; u64 loc = (u64)offset_to_ptr(&frame->initial_loc); const u8 *opcode = frame->opcodes; + int l; - if (fde_has_augmentation_data) { - int l; + // assume single byte uleb128_t for augmentation data size + if (*opcode & BIT(7)) + return EDYNSCS_INVALID_FDE_AUGM_DATA_SIZE; - // assume single byte uleb128_t - if (WARN_ON(*opcode & BIT(7))) - return -ENOEXEC; - - l = *opcode++; - opcode += l; - size -= l + 1; - } + l = *opcode++; + opcode += l; + size -= l + 1; /* * Starting from 'loc', apply the CFA opcodes that advance the location @@ -201,7 +201,7 @@ static int scs_handle_fde_frame(const struct eh_frame *frame, break; default: - return -ENOEXEC; + return EDYNSCS_INVALID_CFA_OPCODE; } } return 0; @@ -209,12 +209,11 @@ static int scs_handle_fde_frame(const struct eh_frame *frame, int scs_patch(const u8 eh_frame[], int size) { + int code_alignment_factor = 1; const u8 *p = eh_frame; while (size > 4) { const struct eh_frame *frame = (const void *)p; - bool fde_has_augmentation_data = true; - int code_alignment_factor = 1; int ret; if (frame->size == 0 || @@ -223,28 +222,36 @@ int scs_patch(const u8 eh_frame[], int size) 
break; if (frame->cie_id_or_pointer == 0) { - const u8 *p = frame->augmentation_string; - - /* a 'z' in the augmentation string must come first */ - fde_has_augmentation_data = *p == 'z'; + /* + * Require presence of augmentation data (z) with a + * specifier for the size of the FDE initial_loc and + * range fields (R), and nothing else. + */ + if (strcmp(frame->augmentation_string, "zR")) + return EDYNSCS_INVALID_CIE_HEADER; /* * The code alignment factor is a uleb128 encoded field * but given that the only sensible values are 1 or 4, - * there is no point in decoding the whole thing. + * there is no point in decoding the whole thing. Also + * sanity check the size of the data alignment factor + * field, and the values of the return address register + * and augmentation data size fields. */ - p += strlen(p) + 1; - if (!WARN_ON(*p & BIT(7))) - code_alignment_factor = *p; + if ((frame->code_alignment_factor & BIT(7)) || + (frame->data_alignment_factor & BIT(7)) || + frame->return_address_register != 30 || + frame->augmentation_data_size != 1) + return EDYNSCS_INVALID_CIE_HEADER; + + code_alignment_factor = frame->code_alignment_factor; } else { - ret = scs_handle_fde_frame(frame, - fde_has_augmentation_data, - code_alignment_factor, + ret = scs_handle_fde_frame(frame, code_alignment_factor, true); if (ret) return ret; - scs_handle_fde_frame(frame, fde_has_augmentation_data, - code_alignment_factor, false); + scs_handle_fde_frame(frame, code_alignment_factor, + false); } p += sizeof(frame->size) + frame->size; From 60de7a647fc51b8467f6fb1a8abfe4cab1c95687 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 6 Nov 2024 19:55:16 +0100 Subject: [PATCH 149/168] arm64/scs: Deal with 64-bit relative offsets in FDE frames In some cases, the compiler may decide to emit DWARF FDE frames with 64-bit signed fields for the code offset and range fields. This may happen when using the large code model, for instance, which permits an executable to be spread out over more than 4 GiB of address space. Whether this is the case can be inferred from the augmentation data in the CIE frame, so decode this data before processing the FDE frames. 
Signed-off-by: Ard Biesheuvel Reviewed-by: Sami Tolvanen Tested-by: Sami Tolvanen Link: https://lore.kernel.org/r/20241106185513.3096442-7-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/pi/patch-scs.c | 34 ++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/pi/patch-scs.c b/arch/arm64/kernel/pi/patch-scs.c index cec8f0a52bbc..55d0cd64ef71 100644 --- a/arch/arm64/kernel/pi/patch-scs.c +++ b/arch/arm64/kernel/pi/patch-scs.c @@ -50,6 +50,10 @@ bool dynamic_scs_is_enabled; #define DW_CFA_GNU_negative_offset_extended 0x2f #define DW_CFA_hi_user 0x3f +#define DW_EH_PE_sdata4 0x0b +#define DW_EH_PE_sdata8 0x0c +#define DW_EH_PE_pcrel 0x10 + enum { PACIASP = 0xd503233f, AUTIASP = 0xd50323bf, @@ -125,6 +129,7 @@ struct eh_frame { u8 data_alignment_factor; u8 return_address_register; u8 augmentation_data_size; + u8 fde_pointer_format; }; struct { // FDE @@ -132,11 +137,18 @@ struct eh_frame { s32 range; u8 opcodes[]; }; + + struct { // FDE + s64 initial_loc64; + s64 range64; + u8 opcodes64[]; + }; }; }; static int scs_handle_fde_frame(const struct eh_frame *frame, int code_alignment_factor, + bool use_sdata8, bool dry_run) { int size = frame->size - offsetof(struct eh_frame, opcodes) + 4; @@ -144,6 +156,12 @@ static int scs_handle_fde_frame(const struct eh_frame *frame, const u8 *opcode = frame->opcodes; int l; + if (use_sdata8) { + loc = (u64)&frame->initial_loc64 + frame->initial_loc64; + opcode = frame->opcodes64; + size -= 8; + } + // assume single byte uleb128_t for augmentation data size if (*opcode & BIT(7)) return EDYNSCS_INVALID_FDE_AUGM_DATA_SIZE; @@ -210,6 +228,7 @@ static int scs_handle_fde_frame(const struct eh_frame *frame, int scs_patch(const u8 eh_frame[], int size) { int code_alignment_factor = 1; + bool fde_use_sdata8 = false; const u8 *p = eh_frame; while (size > 4) { @@ -245,13 +264,24 @@ int scs_patch(const u8 eh_frame[], int size) return EDYNSCS_INVALID_CIE_HEADER; code_alignment_factor = frame->code_alignment_factor; + + switch (frame->fde_pointer_format) { + case DW_EH_PE_pcrel | DW_EH_PE_sdata4: + fde_use_sdata8 = false; + break; + case DW_EH_PE_pcrel | DW_EH_PE_sdata8: + fde_use_sdata8 = true; + break; + default: + return EDYNSCS_INVALID_CIE_SDATA_SIZE; + } } else { ret = scs_handle_fde_frame(frame, code_alignment_factor, - true); + fde_use_sdata8, true); if (ret) return ret; scs_handle_fde_frame(frame, code_alignment_factor, - false); + fde_use_sdata8, false); } p += sizeof(frame->size) + frame->size; From 47965a49a2c88ea850a30190b5eacd2d50f51a4b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 6 Nov 2024 19:55:17 +0100 Subject: [PATCH 150/168] arm64/scs: Drop unused prototype __pi_scs_patch_vmlinux() The function scs_patch_vmlinux() was removed in the LPA2 boot code refactoring so remove the declaration as well. 
Signed-off-by: Ard Biesheuvel Reviewed-by: Sami Tolvanen Tested-by: Sami Tolvanen Link: https://lore.kernel.org/r/20241106185513.3096442-8-ardb+git@google.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/scs.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h index 934e9217cd74..a76f9b387a26 100644 --- a/arch/arm64/include/asm/scs.h +++ b/arch/arm64/include/asm/scs.h @@ -54,7 +54,6 @@ enum { }; int __pi_scs_patch(const u8 eh_frame[], int size); -asmlinkage void __pi_scs_patch_vmlinux(void); #endif /* __ASSEMBLY __ */ From ae465d9ca192f582cf4932e628a25f9625a8bf83 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 8 Nov 2024 15:20:46 +0000 Subject: [PATCH 151/168] kselftest/arm64: Fix build with stricter assemblers While some assemblers (including the LLVM assembler I mostly use) will happily accept SMSTART as an instruction by default others, specifically gas, require that any architecture extensions be explicitly enabled. The assembler SME test programs use manually encoded helpers for the new instructions but no SMSTART helper is defined, only SM and ZA specific variants. Unfortunately the irritators that were just added use plain SMSTART so on stricter assemblers these fail to build: za-test.S:160: Error: selected processor does not support `smstart' Switch to using SMSTART ZA via the manually encoded smstart_za macro we already have defined. Fixes: d65f27d240bb ("kselftest/arm64: Implement irritators for ZA and ZT") Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241108-arm64-selftest-asm-error-v1-1-7ce27b42a677@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/za-test.S | 2 +- tools/testing/selftests/arm64/fp/zt-test.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/za-test.S b/tools/testing/selftests/arm64/fp/za-test.S index f902e6ef9077..5abcb8f009f8 100644 --- a/tools/testing/selftests/arm64/fp/za-test.S +++ b/tools/testing/selftests/arm64/fp/za-test.S @@ -157,7 +157,7 @@ function irritator_handler // This will reset ZA to all bits 0 smstop - smstart + smstart_za ret endfunction diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S index c96cb7c2ad4b..7b9de8d2a873 100644 --- a/tools/testing/selftests/arm64/fp/zt-test.S +++ b/tools/testing/selftests/arm64/fp/zt-test.S @@ -126,7 +126,7 @@ function irritator_handler // This will reset ZT to all bits 0 smstop - smstart + smstart_za ret endfunction From b6bd50dd3b564d50b8cd748de6bae58804ecb768 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 8 Nov 2024 13:49:17 +0000 Subject: [PATCH 152/168] kselftest/arm64: Fix printf() compiler warnings in the arm64 fp tests Lots of incorrect length modifiers, missing arguments or conversion specifiers. Fix them. 
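As background for this and the following printf() fixes: the warnings arise because types such as uint64_t and size_t map onto different basic C types across ABIs (uint64_t is 'unsigned long' on arm64/LP64 but 'unsigned long long' on 32-bit targets), so a hard-coded %llx or %lx can only ever match one of them. A small illustrative snippet, not part of the patch, showing the portable spelling via <inttypes.h>:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	uint64_t v = 0x1234abcdULL;
	size_t n = sizeof(v);

	/* PRIx64 expands to the correct length modifier for the libc in use */
	printf("value: %" PRIx64 ", size: %zu\n", v, n);
	return 0;
}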
Cc: Shuah Khan Cc: Mark Brown Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20241108134920.1233992-2-catalin.marinas@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/sve-ptrace.c | 16 +++++++++------- tools/testing/selftests/arm64/fp/za-ptrace.c | 8 +++++--- tools/testing/selftests/arm64/fp/zt-ptrace.c | 8 +++++--- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c index 6d61992fe8a0..577b6e05e860 100644 --- a/tools/testing/selftests/arm64/fp/sve-ptrace.c +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -82,10 +82,12 @@ static void fill_buf(char *buf, size_t size) static int do_child(void) { if (ptrace(PTRACE_TRACEME, -1, NULL, NULL)) - ksft_exit_fail_msg("PTRACE_TRACEME", strerror(errno)); + ksft_exit_fail_msg("ptrace(PTRACE_TRACEME) failed: %s (%d)\n", + strerror(errno), errno); if (raise(SIGSTOP)) - ksft_exit_fail_msg("raise(SIGSTOP)", strerror(errno)); + ksft_exit_fail_msg("raise(SIGSTOP) failed: %s (%d)\n", + strerror(errno), errno); return EXIT_SUCCESS; } @@ -340,7 +342,7 @@ static void ptrace_set_sve_get_sve_data(pid_t child, data_size = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE); write_buf = malloc(data_size); if (!write_buf) { - ksft_test_result_fail("Error allocating %d byte buffer for %s VL %u\n", + ksft_test_result_fail("Error allocating %ld byte buffer for %s VL %u\n", data_size, type->name, vl); return; } @@ -441,7 +443,7 @@ static void ptrace_set_sve_get_fpsimd_data(pid_t child, data_size = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE); write_buf = malloc(data_size); if (!write_buf) { - ksft_test_result_fail("Error allocating %d byte buffer for %s VL %u\n", + ksft_test_result_fail("Error allocating %ld byte buffer for %s VL %u\n", data_size, type->name, vl); return; } @@ -545,7 +547,7 @@ static void ptrace_set_fpsimd_get_sve_data(pid_t child, read_sve = read_buf; if (read_sve->vl != vl) { - ksft_test_result_fail("Child VL != expected VL %d\n", + ksft_test_result_fail("Child VL != expected VL: %u != %u\n", read_sve->vl, vl); goto out; } @@ -555,7 +557,7 @@ static void ptrace_set_fpsimd_get_sve_data(pid_t child, case SVE_PT_REGS_FPSIMD: expected_size = SVE_PT_FPSIMD_SIZE(vq, SVE_PT_REGS_FPSIMD); if (read_sve_size < expected_size) { - ksft_test_result_fail("Read %d bytes, expected %d\n", + ksft_test_result_fail("Read %ld bytes, expected %ld\n", read_sve_size, expected_size); goto out; } @@ -571,7 +573,7 @@ static void ptrace_set_fpsimd_get_sve_data(pid_t child, case SVE_PT_REGS_SVE: expected_size = SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE); if (read_sve_size < expected_size) { - ksft_test_result_fail("Read %d bytes, expected %d\n", + ksft_test_result_fail("Read %ld bytes, expected %ld\n", read_sve_size, expected_size); goto out; } diff --git a/tools/testing/selftests/arm64/fp/za-ptrace.c b/tools/testing/selftests/arm64/fp/za-ptrace.c index ac27d87396fc..08c777f87ea2 100644 --- a/tools/testing/selftests/arm64/fp/za-ptrace.c +++ b/tools/testing/selftests/arm64/fp/za-ptrace.c @@ -48,10 +48,12 @@ static void fill_buf(char *buf, size_t size) static int do_child(void) { if (ptrace(PTRACE_TRACEME, -1, NULL, NULL)) - ksft_exit_fail_msg("PTRACE_TRACEME", strerror(errno)); + ksft_exit_fail_msg("ptrace(PTRACE_TRACEME) failed: %s (%d)", + strerror(errno), errno); if (raise(SIGSTOP)) - ksft_exit_fail_msg("raise(SIGSTOP)", strerror(errno)); + ksft_exit_fail_msg("raise(SIGSTOP) failed: %s (%d)\n", + strerror(errno), errno); 
return EXIT_SUCCESS; } @@ -201,7 +203,7 @@ static void ptrace_set_get_data(pid_t child, unsigned int vl) data_size = ZA_PT_SIZE(vq); write_buf = malloc(data_size); if (!write_buf) { - ksft_test_result_fail("Error allocating %d byte buffer for VL %u\n", + ksft_test_result_fail("Error allocating %ld byte buffer for VL %u\n", data_size, vl); return; } diff --git a/tools/testing/selftests/arm64/fp/zt-ptrace.c b/tools/testing/selftests/arm64/fp/zt-ptrace.c index 996d9614a131..584b8d59b7ea 100644 --- a/tools/testing/selftests/arm64/fp/zt-ptrace.c +++ b/tools/testing/selftests/arm64/fp/zt-ptrace.c @@ -43,10 +43,12 @@ static void fill_buf(char *buf, size_t size) static int do_child(void) { if (ptrace(PTRACE_TRACEME, -1, NULL, NULL)) - ksft_exit_fail_msg("PTRACE_TRACEME", strerror(errno)); + ksft_exit_fail_msg("ptrace(PTRACE_TRACEME) failed: %s (%d)\n", + strerror(errno), errno); if (raise(SIGSTOP)) - ksft_exit_fail_msg("raise(SIGSTOP)", strerror(errno)); + ksft_exit_fail_msg("raise(SIGSTOP) failed: %s (%d)\n", + strerror(errno), errno); return EXIT_SUCCESS; } @@ -231,7 +233,7 @@ static void ptrace_enable_za_via_zt(pid_t child) /* Should have register data */ if (za_out->size < ZA_PT_SIZE(vq)) { ksft_print_msg("ZA data less than expected: %u < %u\n", - za_out->size, ZA_PT_SIZE(vq)); + za_out->size, (unsigned int)ZA_PT_SIZE(vq)); fail = true; vq = 0; } From 0cc6b94a445c53ab152554b4cf60575e1396adf6 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 8 Nov 2024 13:49:18 +0000 Subject: [PATCH 153/168] kselftest/arm64: Fix printf() warning in the arm64 MTE prctl() test While prctl() returns an 'int', the PR_MTE_TCF_MASK is defined as unsigned long which results in the larger type following a bitwise 'and' operation. Cast the printf() argument to 'int'. Cc: Shuah Khan Cc: Mark Brown Link: https://lore.kernel.org/r/20241108134920.1233992-3-catalin.marinas@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/mte/check_prctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/mte/check_prctl.c b/tools/testing/selftests/arm64/mte/check_prctl.c index f139a33a43ef..4c89e9538ca0 100644 --- a/tools/testing/selftests/arm64/mte/check_prctl.c +++ b/tools/testing/selftests/arm64/mte/check_prctl.c @@ -85,7 +85,7 @@ void set_mode_test(const char *name, int hwcap2, int mask) ksft_test_result_pass("%s\n", name); } else { ksft_print_msg("Got %x, expected %x\n", - (ret & PR_MTE_TCF_MASK), mask); + (ret & (int)PR_MTE_TCF_MASK), mask); ksft_test_result_fail("%s\n", name); } } From 694e2803fece8d066bd85ce8607c630ce2b69859 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 8 Nov 2024 13:49:19 +0000 Subject: [PATCH 154/168] kselftest/arm64: Fix printf() compiler warnings in the arm64 syscall-abi.c tests Fix the incorrect length modifiers in arm64/abi/syscall-abi.c. 
Cc: Shuah Khan Cc: Mark Brown Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20241108134920.1233992-4-catalin.marinas@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/abi/syscall-abi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/arm64/abi/syscall-abi.c b/tools/testing/selftests/arm64/abi/syscall-abi.c index d704511a0955..5ec9a18ec802 100644 --- a/tools/testing/selftests/arm64/abi/syscall-abi.c +++ b/tools/testing/selftests/arm64/abi/syscall-abi.c @@ -81,7 +81,7 @@ static int check_gpr(struct syscall_cfg *cfg, int sve_vl, int sme_vl, uint64_t s */ for (i = 9; i < ARRAY_SIZE(gpr_in); i++) { if (gpr_in[i] != gpr_out[i]) { - ksft_print_msg("%s SVE VL %d mismatch in GPR %d: %llx != %llx\n", + ksft_print_msg("%s SVE VL %d mismatch in GPR %d: %lx != %lx\n", cfg->name, sve_vl, i, gpr_in[i], gpr_out[i]); errors++; @@ -112,7 +112,7 @@ static int check_fpr(struct syscall_cfg *cfg, int sve_vl, int sme_vl, if (!sve_vl && !(svcr & SVCR_SM_MASK)) { for (i = 0; i < ARRAY_SIZE(fpr_in); i++) { if (fpr_in[i] != fpr_out[i]) { - ksft_print_msg("%s Q%d/%d mismatch %llx != %llx\n", + ksft_print_msg("%s Q%d/%d mismatch %lx != %lx\n", cfg->name, i / 2, i % 2, fpr_in[i], fpr_out[i]); @@ -294,13 +294,13 @@ static int check_svcr(struct syscall_cfg *cfg, int sve_vl, int sme_vl, int errors = 0; if (svcr_out & SVCR_SM_MASK) { - ksft_print_msg("%s Still in SM, SVCR %llx\n", + ksft_print_msg("%s Still in SM, SVCR %lx\n", cfg->name, svcr_out); errors++; } if ((svcr_in & SVCR_ZA_MASK) != (svcr_out & SVCR_ZA_MASK)) { - ksft_print_msg("%s PSTATE.ZA changed, SVCR %llx != %llx\n", + ksft_print_msg("%s PSTATE.ZA changed, SVCR %lx != %lx\n", cfg->name, svcr_in, svcr_out); errors++; } From 929bbc16abfb0144db7ac619c77f60b188e555ab Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 8 Nov 2024 11:05:49 +0000 Subject: [PATCH 155/168] selftests/mm: Fix unused function warning for aarch64_write_signal_pkey() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 49f59573e9e0 ("selftests/mm: Enable pkey_sighandler_tests on arm64"), pkey_sighandler_tests.c (which includes pkey-arm64.h via pkey-helpers.h) ends up compiled for arm64. Since it doesn't use aarch64_write_signal_pkey(), the compiler warns: In file included from pkey-helpers.h:106, from pkey_sighandler_tests.c:31: pkey-arm64.h:130:13: warning: ‘aarch64_write_signal_pkey’ defined but not used [-Wunused-function] 130 | static void aarch64_write_signal_pkey(ucontext_t *uctxt, u64 pkey) | ^~~~~~~~~~~~~~~~~~~~~~~~~ Make the aarch64_write_signal_pkey() a 'static inline void' function to avoid the compiler warning. 
Fixes: f5b5ea51f78f ("selftests: mm: make protection_keys test work on arm64") Cc: Shuah Khan Cc: Joey Gouly Cc: Kevin Brodsky Reviewed-by: Kevin Brodsky Link: https://lore.kernel.org/r/20241108110549.1185923-1-catalin.marinas@arm.com Signed-off-by: Catalin Marinas --- tools/testing/selftests/mm/pkey-arm64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/pkey-arm64.h b/tools/testing/selftests/mm/pkey-arm64.h index d57fbeace38f..d9d2100eafc0 100644 --- a/tools/testing/selftests/mm/pkey-arm64.h +++ b/tools/testing/selftests/mm/pkey-arm64.h @@ -127,7 +127,7 @@ static inline u64 get_pkey_bits(u64 reg, int pkey) return 0; } -static void aarch64_write_signal_pkey(ucontext_t *uctxt, u64 pkey) +static inline void aarch64_write_signal_pkey(ucontext_t *uctxt, u64 pkey) { struct _aarch64_ctx *ctx = GET_UC_RESV_HEAD(uctxt); struct poe_context *poe_ctx = From 116e50d6474e82579086c0397d2fa3999815f29e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 6 Nov 2024 17:07:51 +0000 Subject: [PATCH 156/168] kselftest/arm64: Check that SVCR is 0 in signal handlers We don't currently validate that we exit streaming mode and clear ZA when we enter a signal handler. Add simple checks for this in the SSVE and ZA tests. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241106-arm64-fpmr-signal-test-v1-1-31fa34ce58fe@kernel.org [catalin.marinas@arm.com: Use %lx in fprintf() as uint64_t seems to be unsigned long in glibc] Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/signal/sve_helpers.h | 13 +++++++++++++ .../selftests/arm64/signal/testcases/ssve_regs.c | 5 +++++ .../selftests/arm64/signal/testcases/za_regs.c | 5 +++++ 3 files changed, 23 insertions(+) diff --git a/tools/testing/selftests/arm64/signal/sve_helpers.h b/tools/testing/selftests/arm64/signal/sve_helpers.h index 50948ce471cc..ca133b93375f 100644 --- a/tools/testing/selftests/arm64/signal/sve_helpers.h +++ b/tools/testing/selftests/arm64/signal/sve_helpers.h @@ -18,4 +18,17 @@ extern unsigned int nvls; int sve_fill_vls(bool use_sme, int min_vls); +static inline uint64_t get_svcr(void) +{ + uint64_t val; + + asm volatile ( + "mrs %0, S3_3_C4_C2_2\n" + : "=r"(val) + : + : "cc"); + + return val; +} + #endif diff --git a/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c b/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c index 6dbe48cf8b09..1dbca9afb13c 100644 --- a/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c @@ -85,6 +85,11 @@ static int do_one_sme_vl(struct tdescr *td, siginfo_t *si, ucontext_t *uc, fprintf(stderr, "Got expected size %u and VL %d\n", head->size, ssve->vl); + if (get_svcr() != 0) { + fprintf(stderr, "Unexpected SVCR %lx\n", get_svcr()); + return 1; + } + return 0; } diff --git a/tools/testing/selftests/arm64/signal/testcases/za_regs.c b/tools/testing/selftests/arm64/signal/testcases/za_regs.c index b9e13f27f1f9..badaead5326a 100644 --- a/tools/testing/selftests/arm64/signal/testcases/za_regs.c +++ b/tools/testing/selftests/arm64/signal/testcases/za_regs.c @@ -91,6 +91,11 @@ static int do_one_sme_vl(struct tdescr *td, siginfo_t *si, ucontext_t *uc, return 1; } + if (get_svcr() != 0) { + fprintf(stderr, "Unexpected SVCR %lx\n", get_svcr()); + return 1; + } + return 0; } From c297aa7d3fb6755890b78b483e82c9cf07370d50 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 11 Nov 2024 18:32:58 +0000 Subject: [PATCH 157/168] kselftest/arm64: Enable build of PAC tests with LLVM=1 
Currently we don't build the PAC selftests when building with LLVM=1 since we attempt to test for PAC support in the toolchain before we've set up the build system to point at LLVM in lib.mk, which has to be one of the last things in the Makefile. Since all versions of LLVM supported for use with the kernel have PAC support we can just sidestep the issue by just assuming PAC is there when doing a LLVM=1 build. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241111-arm64-selftest-pac-clang-v1-1-08599ceee418@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/pauth/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/arm64/pauth/Makefile b/tools/testing/selftests/arm64/pauth/Makefile index 72e290b0b10c..b5a1c80e0ead 100644 --- a/tools/testing/selftests/arm64/pauth/Makefile +++ b/tools/testing/selftests/arm64/pauth/Makefile @@ -7,8 +7,14 @@ CC := $(CROSS_COMPILE)gcc endif CFLAGS += -mbranch-protection=pac-ret + +# All supported LLVMs have PAC, test for GCC +ifeq ($(LLVM),1) +pauth_cc_support := 1 +else # check if the compiler supports ARMv8.3 and branch protection with PAuth pauth_cc_support := $(shell if ($(CC) $(CFLAGS) -march=armv8.3-a -E -x c /dev/null -o /dev/null 2>&1) then echo "1"; fi) +endif ifeq ($(pauth_cc_support),1) TEST_GEN_PROGS := pac From c0350076c13eac4f1d7f7ab6acd43bb252baef7a Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 12 Nov 2024 13:08:14 +0000 Subject: [PATCH 158/168] kselftets/arm64: Use flag bits for features in fp-ptrace assembler code The assembler portions of fp-ptrace are passed feature flags by the C code indicating which architectural features are supported. Currently these use an entire register for each flag which is wasteful and gets cumbersome as new flags are added. Switch to using flag bits in a single register to make things easier to maintain. No functional change. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241112-arm64-fp-ptrace-fpmr-v2-1-250b57c61254@kernel.org Signed-off-by: Catalin Marinas --- .../selftests/arm64/fp/fp-ptrace-asm.S | 32 +++++++++++-------- tools/testing/selftests/arm64/fp/fp-ptrace.c | 17 +++++++--- tools/testing/selftests/arm64/fp/fp-ptrace.h | 10 ++++++ 3 files changed, 41 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S b/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S index 7ad59d92d02b..5e7e9c878f2c 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S +++ b/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S @@ -15,10 +15,7 @@ // Load and save register values with pauses for ptrace // -// x0 - SVE in use -// x1 - SME in use -// x2 - SME2 in use -// x3 - FA64 supported +// x0 - HAVE_ flags indicating which features are in use .globl load_and_save load_and_save: @@ -44,7 +41,7 @@ load_and_save: ldp q30, q31, [x7, #16 * 30] // SME? - cbz x1, check_sve_in + tbz x0, #HAVE_SME_SHIFT, check_sve_in adrp x7, svcr_in ldr x7, [x7, :lo12:svcr_in] @@ -64,7 +61,7 @@ load_and_save: bne 1b // ZT? - cbz x2, check_sm_in + tbz x0, #HAVE_SME2_SHIFT, check_sm_in adrp x6, zt_in add x6, x6, :lo12:zt_in _ldr_zt 6 @@ -72,12 +69,16 @@ load_and_save: // In streaming mode? check_sm_in: tbz x7, #SVCR_SM_SHIFT, check_sve_in - mov x4, x3 // Load FFR if we have FA64 + + // Load FFR if we have FA64 + mov x4, #0 + tbz x0, #HAVE_FA64_SHIFT, load_sve + mov x4, #1 b load_sve // SVE? 
check_sve_in: - cbz x0, wait_for_writes + tbz x0, #HAVE_SVE_SHIFT, wait_for_writes mov x4, #1 load_sve: @@ -165,8 +166,7 @@ wait_for_writes: stp q28, q29, [x7, #16 * 28] stp q30, q31, [x7, #16 * 30] - // SME? - cbz x1, check_sve_out + tbz x0, #HAVE_SME_SHIFT, check_sve_out rdsvl 11, 1 adrp x6, sme_vl_out @@ -187,7 +187,7 @@ wait_for_writes: bne 1b // ZT? - cbz x2, check_sm_out + tbz x0, #HAVE_SME2_SHIFT, check_sm_out adrp x6, zt_out add x6, x6, :lo12:zt_out _str_zt 6 @@ -195,12 +195,16 @@ wait_for_writes: // In streaming mode? check_sm_out: tbz x7, #SVCR_SM_SHIFT, check_sve_out - mov x4, x3 // FFR? + + // Do we have FA64 and FFR? + mov x4, #0 + tbz x0, #HAVE_FA64_SHIFT, read_sve + mov x4, #1 b read_sve // SVE? check_sve_out: - cbz x0, wait_for_reads + tbz x0, #HAVE_SVE_SHIFT, wait_for_reads mov x4, #1 rdvl x7, #1 @@ -271,7 +275,7 @@ wait_for_reads: brk #0 // Ensure we don't leave ourselves in streaming mode - cbz x1, out + tbz x0, #HAVE_SME_SHIFT, out msr S3_3_C4_C2_2, xzr out: diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c index c7ceafe5f471..d96af27487fa 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace.c +++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c @@ -82,7 +82,7 @@ uint64_t sve_vl_out; uint64_t sme_vl_out; uint64_t svcr_in, svcr_expected, svcr_out; -void load_and_save(int sve, int sme, int sme2, int fa64); +void load_and_save(int flags); static bool got_alarm; @@ -198,7 +198,7 @@ static int vl_expected(struct test_config *config) static void run_child(struct test_config *config) { - int ret; + int ret, flags; /* Let the parent attach to us */ ret = ptrace(PTRACE_TRACEME, 0, 0, 0); @@ -224,8 +224,17 @@ static void run_child(struct test_config *config) } /* Load values and wait for the parent */ - load_and_save(sve_supported(), sme_supported(), - sme2_supported(), fa64_supported()); + flags = 0; + if (sve_supported()) + flags |= HAVE_SVE; + if (sme_supported()) + flags |= HAVE_SME; + if (sme2_supported()) + flags |= HAVE_SME2; + if (fa64_supported()) + flags |= HAVE_FA64; + + load_and_save(flags); exit(0); } diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.h b/tools/testing/selftests/arm64/fp/fp-ptrace.h index db4f2c4d750c..36ca627e1980 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace.h +++ b/tools/testing/selftests/arm64/fp/fp-ptrace.h @@ -10,4 +10,14 @@ #define SVCR_SM (1 << SVCR_SM_SHIFT) #define SVCR_ZA (1 << SVCR_ZA_SHIFT) +#define HAVE_SVE_SHIFT 0 +#define HAVE_SME_SHIFT 1 +#define HAVE_SME2_SHIFT 2 +#define HAVE_FA64_SHIFT 3 + +#define HAVE_SVE (1 << HAVE_SVE_SHIFT) +#define HAVE_SME (1 << HAVE_SME_SHIFT) +#define HAVE_SME2 (1 << HAVE_SME2_SHIFT) +#define HAVE_FA64 (1 << HAVE_FA64_SHIFT) + #endif From 7e9c5b00009a625cc304c865192978c01c0cc077 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 12 Nov 2024 13:08:15 +0000 Subject: [PATCH 159/168] kselftest/arm64: Expand the set of ZA writes fp-ptrace does Currently our test for implementable ZA writes is written in a bit of a convoluted fashion which excludes all changes where we clear SVCR.SM even though we can actually support that since changing the vector length resets SVCR. Make the logic more direct, enabling us to actually run these cases. 
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241112-arm64-fp-ptrace-fpmr-v2-2-250b57c61254@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/fp-ptrace.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c index d96af27487fa..56cf6e02c535 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace.c +++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c @@ -1078,21 +1078,19 @@ static void sve_write(pid_t child, struct test_config *config) static bool za_write_supported(struct test_config *config) { - if (config->svcr_expected & SVCR_SM) { - if (!(config->svcr_in & SVCR_SM)) + if (config->sme_vl_in != config->sme_vl_expected) { + /* Changing the SME VL exits streaming mode. */ + if (config->svcr_expected & SVCR_SM) { return false; - - /* Changing the SME VL exits streaming mode */ - if (config->sme_vl_in != config->sme_vl_expected) { + } + } else { + /* Otherwise we can't change streaming mode */ + if ((config->svcr_in & SVCR_SM) != + (config->svcr_expected & SVCR_SM)) { return false; } } - /* Can't disable SM outside a VL change */ - if ((config->svcr_in & SVCR_SM) && - !(config->svcr_expected & SVCR_SM)) - return false; - return true; } From 7dbd26d0b22d69d36ab3e76ee7f152482a19cbed Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 12 Nov 2024 13:08:16 +0000 Subject: [PATCH 160/168] kselftest/arm64: Add FPMR coverage to fp-ptrace Add coverage for FPMR to fp-ptrace. FPMR can be available independently of SVE and SME, if SME is supported then FPMR is cleared by entering and exiting streaming mode. As with other registers we generate random values to load into the register, we restrict these to bitfields which are always defined. We also leave bitfields where the valid values are affected by the set of supported FP8 formats zero to reduce complexity, it is unlikely that specific bitfields will be affected by ptrace issues. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241112-arm64-fp-ptrace-fpmr-v2-3-250b57c61254@kernel.org [catalin.marinas@arm.com: use REG_FPMR instead of FPMR] Signed-off-by: Catalin Marinas --- .../selftests/arm64/fp/fp-ptrace-asm.S | 23 +++- tools/testing/selftests/arm64/fp/fp-ptrace.c | 126 ++++++++++++++++++ tools/testing/selftests/arm64/fp/fp-ptrace.h | 2 + tools/testing/selftests/arm64/fp/sme-inst.h | 2 + 4 files changed, 146 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S b/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S index 5e7e9c878f2c..82c3ab70e1cf 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S +++ b/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S @@ -71,14 +71,12 @@ check_sm_in: tbz x7, #SVCR_SM_SHIFT, check_sve_in // Load FFR if we have FA64 - mov x4, #0 - tbz x0, #HAVE_FA64_SHIFT, load_sve - mov x4, #1 + ubfx x4, x0, #HAVE_FA64_SHIFT, #1 b load_sve // SVE? 
check_sve_in: - tbz x0, #HAVE_SVE_SHIFT, wait_for_writes + tbz x0, #HAVE_SVE_SHIFT, check_fpmr_in mov x4, #1 load_sve: @@ -143,6 +141,13 @@ load_sve: ldr p14, [x7, #14, MUL VL] ldr p15, [x7, #15, MUL VL] + // This has to come after we set PSTATE.SM +check_fpmr_in: + tbz x0, #HAVE_FPMR_SHIFT, wait_for_writes + adrp x7, fpmr_in + ldr x7, [x7, :lo12:fpmr_in] + msr REG_FPMR, x7 + wait_for_writes: // Wait for the parent brk #0 @@ -166,6 +171,12 @@ wait_for_writes: stp q28, q29, [x7, #16 * 28] stp q30, q31, [x7, #16 * 30] + tbz x0, #HAVE_FPMR_SHIFT, check_sme_out + mrs x7, REG_FPMR + adrp x6, fpmr_out + str x7, [x6, :lo12:fpmr_out] + +check_sme_out: tbz x0, #HAVE_SME_SHIFT, check_sve_out rdsvl 11, 1 @@ -197,9 +208,7 @@ check_sm_out: tbz x7, #SVCR_SM_SHIFT, check_sve_out // Do we have FA64 and FFR? - mov x4, #0 - tbz x0, #HAVE_FA64_SHIFT, read_sve - mov x4, #1 + ubfx x4, x0, #HAVE_FA64_SHIFT, #1 b read_sve // SVE? diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c index 56cf6e02c535..4930e03a7b99 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace.c +++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c @@ -31,6 +31,14 @@ #include "fp-ptrace.h" +#include + +#define FPMR_LSCALE2_MASK GENMASK(37, 32) +#define FPMR_NSCALE_MASK GENMASK(31, 24) +#define FPMR_LSCALE_MASK GENMASK(22, 16) +#define FPMR_OSC_MASK GENMASK(15, 15) +#define FPMR_OSM_MASK GENMASK(14, 14) + /* and don't like each other, so: */ #ifndef NT_ARM_SVE #define NT_ARM_SVE 0x405 @@ -48,11 +56,22 @@ #define NT_ARM_ZT 0x40d #endif +#ifndef NT_ARM_FPMR +#define NT_ARM_FPMR 0x40e +#endif + #define ARCH_VQ_MAX 256 /* VL 128..2048 in powers of 2 */ #define MAX_NUM_VLS 5 +/* + * FPMR bits we can set without doing feature checks to see if values + * are valid. 
+ */ +#define FPMR_SAFE_BITS (FPMR_LSCALE2_MASK | FPMR_NSCALE_MASK | \ + FPMR_LSCALE_MASK | FPMR_OSC_MASK | FPMR_OSM_MASK) + #define NUM_FPR 32 __uint128_t v_in[NUM_FPR]; __uint128_t v_expected[NUM_FPR]; @@ -78,6 +97,8 @@ char zt_in[ZT_SIG_REG_BYTES]; char zt_expected[ZT_SIG_REG_BYTES]; char zt_out[ZT_SIG_REG_BYTES]; +uint64_t fpmr_in, fpmr_expected, fpmr_out; + uint64_t sve_vl_out; uint64_t sme_vl_out; uint64_t svcr_in, svcr_expected, svcr_out; @@ -128,6 +149,11 @@ static bool fa64_supported(void) return getauxval(AT_HWCAP2) & HWCAP2_SME_FA64; } +static bool fpmr_supported(void) +{ + return getauxval(AT_HWCAP2) & HWCAP2_FPMR; +} + static bool compare_buffer(const char *name, void *out, void *expected, size_t size) { @@ -233,6 +259,8 @@ static void run_child(struct test_config *config) flags |= HAVE_SME2; if (fa64_supported()) flags |= HAVE_FA64; + if (fpmr_supported()) + flags |= HAVE_FPMR; load_and_save(flags); @@ -321,6 +349,14 @@ static void read_child_regs(pid_t child) iov_child.iov_len = sizeof(zt_out); read_one_child_regs(child, "ZT", &iov_parent, &iov_child); } + + if (fpmr_supported()) { + iov_parent.iov_base = &fpmr_out; + iov_parent.iov_len = sizeof(fpmr_out); + iov_child.iov_base = &fpmr_out; + iov_child.iov_len = sizeof(fpmr_out); + read_one_child_regs(child, "FPMR", &iov_parent, &iov_child); + } } static bool continue_breakpoint(pid_t child, @@ -595,6 +631,26 @@ static bool check_ptrace_values_zt(pid_t child, struct test_config *config) return compare_buffer("initial ZT", buf, zt_in, ZT_SIG_REG_BYTES); } +static bool check_ptrace_values_fpmr(pid_t child, struct test_config *config) +{ + uint64_t val; + struct iovec iov; + int ret; + + if (!fpmr_supported()) + return true; + + iov.iov_base = &val; + iov.iov_len = sizeof(val); + ret = ptrace(PTRACE_GETREGSET, child, NT_ARM_FPMR, &iov); + if (ret != 0) { + ksft_print_msg("Failed to read initial FPMR: %s (%d)\n", + strerror(errno), errno); + return false; + } + + return compare_buffer("initial FPMR", &val, &fpmr_in, sizeof(val)); +} static bool check_ptrace_values(pid_t child, struct test_config *config) { @@ -629,6 +685,9 @@ static bool check_ptrace_values(pid_t child, struct test_config *config) if (!check_ptrace_values_zt(child, config)) pass = false; + if (!check_ptrace_values_fpmr(child, config)) + pass = false; + return pass; } @@ -832,11 +891,18 @@ static void set_initial_values(struct test_config *config) { int vq = __sve_vq_from_vl(vl_in(config)); int sme_vq = __sve_vq_from_vl(config->sme_vl_in); + bool sm_change; svcr_in = config->svcr_in; svcr_expected = config->svcr_expected; svcr_out = 0; + if (sme_supported() && + (svcr_in & SVCR_SM) != (svcr_expected & SVCR_SM)) + sm_change = true; + else + sm_change = false; + fill_random(&v_in, sizeof(v_in)); memcpy(v_expected, v_in, sizeof(v_in)); memset(v_out, 0, sizeof(v_out)); @@ -883,6 +949,21 @@ static void set_initial_values(struct test_config *config) memset(zt_expected, 0, ZT_SIG_REG_BYTES); memset(zt_out, 0, sizeof(zt_out)); } + + if (fpmr_supported()) { + fill_random(&fpmr_in, sizeof(fpmr_in)); + fpmr_in &= FPMR_SAFE_BITS; + + /* Entering or exiting streaming mode clears FPMR */ + if (sm_change) + fpmr_expected = 0; + else + fpmr_expected = fpmr_in; + } else { + fpmr_in = 0; + fpmr_expected = 0; + fpmr_out = 0; + } } static bool check_memory_values(struct test_config *config) @@ -933,6 +1014,12 @@ static bool check_memory_values(struct test_config *config) if (!compare_buffer("saved ZT", zt_out, zt_expected, ZT_SIG_REG_BYTES)) pass = false; + if (fpmr_out != 
fpmr_expected) { + ksft_print_msg("Mismatch in saved FPMR: %lx != %lx\n", + fpmr_out, fpmr_expected); + pass = false; + } + return pass; } @@ -1010,6 +1097,36 @@ static void fpsimd_write(pid_t child, struct test_config *test_config) strerror(errno), errno); } +static bool fpmr_write_supported(struct test_config *config) +{ + if (!fpmr_supported()) + return false; + + if (!sve_sme_same(config)) + return false; + + return true; +} + +static void fpmr_write_expected(struct test_config *config) +{ + fill_random(&fpmr_expected, sizeof(fpmr_expected)); + fpmr_expected &= FPMR_SAFE_BITS; +} + +static void fpmr_write(pid_t child, struct test_config *config) +{ + struct iovec iov; + int ret; + + iov.iov_len = sizeof(fpmr_expected); + iov.iov_base = &fpmr_expected; + ret = ptrace(PTRACE_SETREGSET, child, NT_ARM_FPMR, &iov); + if (ret != 0) + ksft_print_msg("Failed to write FPMR: %s (%d)\n", + strerror(errno), errno); +} + static void sve_write_expected(struct test_config *config) { int vl = vl_expected(config); @@ -1266,6 +1383,12 @@ static struct test_definition base_test_defs[] = { .set_expected_values = fpsimd_write_expected, .modify_values = fpsimd_write, }, + { + .name = "FPMR write", + .supported = fpmr_write_supported, + .set_expected_values = fpmr_write_expected, + .modify_values = fpmr_write, + }, }; static struct test_definition sve_test_defs[] = { @@ -1475,6 +1598,9 @@ int main(void) if (fa64_supported()) ksft_print_msg("FA64 supported\n"); + if (fpmr_supported()) + ksft_print_msg("FPMR supported\n"); + ksft_set_plan(tests); /* Get signal handers ready before we start any children */ diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.h b/tools/testing/selftests/arm64/fp/fp-ptrace.h index 36ca627e1980..c06919aaf1f7 100644 --- a/tools/testing/selftests/arm64/fp/fp-ptrace.h +++ b/tools/testing/selftests/arm64/fp/fp-ptrace.h @@ -14,10 +14,12 @@ #define HAVE_SME_SHIFT 1 #define HAVE_SME2_SHIFT 2 #define HAVE_FA64_SHIFT 3 +#define HAVE_FPMR_SHIFT 4 #define HAVE_SVE (1 << HAVE_SVE_SHIFT) #define HAVE_SME (1 << HAVE_SME_SHIFT) #define HAVE_SME2 (1 << HAVE_SME2_SHIFT) #define HAVE_FA64 (1 << HAVE_FA64_SHIFT) +#define HAVE_FPMR (1 << HAVE_FPMR_SHIFT) #endif diff --git a/tools/testing/selftests/arm64/fp/sme-inst.h b/tools/testing/selftests/arm64/fp/sme-inst.h index 9292bba5400b..85b9184e0835 100644 --- a/tools/testing/selftests/arm64/fp/sme-inst.h +++ b/tools/testing/selftests/arm64/fp/sme-inst.h @@ -5,6 +5,8 @@ #ifndef SME_INST_H #define SME_INST_H +#define REG_FPMR S3_3_C4_C4_2 + /* * RDSVL X\nx, #\imm */ From 016d659e62ad9ddda1b6899468d0d0798ed71a4d Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 12 Nov 2024 14:35:05 +0000 Subject: [PATCH 161/168] kselftest/arm64: Fix missing printf() argument in gcs/gcs-stress.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling the child_cleanup() function results in: gcs-stress.c: In function ‘child_cleanup’: gcs-stress.c:266:75: warning: format ‘%d’ expects a matching ‘int’ argument [-Wformat=] 266 | ksft_print_msg("%s: Exited due to signal %d\n", | ~^ | | | int Add the missing child->exit_signal argument. 
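For reference, a minimal standalone reproduction of the same class of bug (an illustrative sketch of my own, not code taken from the selftest); -Wformat, which -Wall enables, is what produces the warning quoted above:

  #include <stdio.h>

  static void report(const char *name, int sig)
  {
          /* printf("%s: Exited due to signal %d\n", name);   <- '%d' has no argument */
          printf("%s: Exited due to signal %d\n", name, sig); /* fixed form */
  }

  int main(void)
  {
          report("child-0", 9);
          return 0;
  }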
Fixes: 05e6cfff58c4 ("kselftest/arm64: Add a GCS stress test") Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/gcs/gcs-stress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress.c b/tools/testing/selftests/arm64/gcs/gcs-stress.c index 03222c36c436..bbc7f4950c13 100644 --- a/tools/testing/selftests/arm64/gcs/gcs-stress.c +++ b/tools/testing/selftests/arm64/gcs/gcs-stress.c @@ -264,7 +264,7 @@ static void child_cleanup(struct child_data *child) if (WIFSIGNALED(status)) { child->exit_signal = WTERMSIG(status); ksft_print_msg("%s: Exited due to signal %d\n", - child->name); + child->name, child->exit_signal); fail = true; child->exited = true; } From de7fb8d3a2c913e36d78d762e9fbdd754d8dc749 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Nov 2024 13:22:49 +0530 Subject: [PATCH 162/168] arm64/mm: Change protval as 'pteval_t' in map_range() pgprot_t has been defined as an encapsulated structure with pteval_t as its element. Hence it is prudent to use pteval_t as the type instead of via the size based u64. Besides pteval_t type might be different size later on with FEAT_D128. Cc: Will Deacon Cc: Ard Biesheuvel Cc: Ryan Roberts Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/20241111075249.609493-1-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/pi/map_range.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c index 5410b2cac590..2b69e3beeef8 100644 --- a/arch/arm64/kernel/pi/map_range.c +++ b/arch/arm64/kernel/pi/map_range.c @@ -30,7 +30,7 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, u64 va_offset) { u64 cmask = (level == 3) ? CONT_PTE_SIZE - 1 : U64_MAX; - u64 protval = pgprot_val(prot) & ~PTE_TYPE_MASK; + pteval_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK; int lshift = (3 - level) * (PAGE_SHIFT - 3); u64 lmask = (PAGE_SIZE << lshift) - 1; From f95382d73ec8bfee2b4a00b2ac4aa4530f7c4af3 Mon Sep 17 00:00:00 2001 From: Min-Hua Chen Date: Wed, 18 Sep 2024 07:38:24 +0800 Subject: [PATCH 163/168] acpi/arm64: remove unnecessary cast DEFINE_RES_IRQ returns struct resource type, so it is unnecessary to cast it to struct resource. Remove the unnecessary cast to fix the following sparse warnings: drivers/acpi/arm64/gtdt.c:355:19: sparse: warning: cast to non-scalar drivers/acpi/arm64/gtdt.c:355:19: sparse: warning: cast from non-scalar No functional changes intended. 
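For illustration, the shape of the pattern with a simplified stand-in macro (the field layout and flag value here are assumptions, not the real include/linux/ioport.h definition): the macro already yields a value of type struct resource, so a plain assignment works and the extra cast adds nothing.

  #include <stdio.h>

  struct resource {
          unsigned long start;
          unsigned long end;
          unsigned long flags;
  };

  /* Simplified stand-in for DEFINE_RES_IRQ(); the real macro sets more fields. */
  #define DEFINE_RES_IRQ(irq) \
          ((struct resource){ .start = (irq), .end = (irq), .flags = 0x1 })

  int main(void)
  {
          struct resource res[3];
          int irq = 42;

          res[2] = DEFINE_RES_IRQ(irq);   /* no cast needed */
          printf("irq resource: start=%lu\n", res[2].start);
          return 0;
  }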
Signed-off-by: Min-Hua Chen Acked-by: Hanjun Guo Reviewed-by: Hanjun Guo Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240917233827.73167-1-minhuadotchen@gmail.com Signed-off-by: Catalin Marinas --- drivers/acpi/arm64/gtdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c index 81dc0582743d..3561553eff8b 100644 --- a/drivers/acpi/arm64/gtdt.c +++ b/drivers/acpi/arm64/gtdt.c @@ -363,7 +363,7 @@ static int __init gtdt_import_sbsa_gwdt(struct acpi_gtdt_watchdog *wd, } irq = map_gt_gsi(wd->timer_interrupt, wd->timer_flags); - res[2] = (struct resource)DEFINE_RES_IRQ(irq); + res[2] = DEFINE_RES_IRQ(irq); if (irq <= 0) { pr_warn("failed to map the Watchdog interrupt.\n"); nr_res--; From 3e360ef0c0a1fb6ce9a302e40b8057c41ba8a9d2 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 7 Nov 2024 01:39:22 +0000 Subject: [PATCH 164/168] kselftest/arm64: Corrupt P0 in the irritator when testing SSVE When building for streaming SVE the irritator for SVE skips updates of both P0 and FFR. While FFR is skipped since it might not be present there is no reason to skip corrupting P0 so switch to an instruction valid in streaming mode and move the ifdef. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241107-arm64-fp-stress-irritator-v2-3-c4b9622e36ee@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/sve-test.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S index f1fb9745c681..28eb8b5cc2d2 100644 --- a/tools/testing/selftests/arm64/fp/sve-test.S +++ b/tools/testing/selftests/arm64/fp/sve-test.S @@ -302,9 +302,9 @@ function irritator_handler movi v0.8b, #1 movi v9.16b, #2 movi v31.8b, #3 -#ifndef SSVE // And P0 - rdffr p0.b + ptrue p0.d +#ifndef SSVE // And FFR wrffr p15.b #endif From c0139f6cbb1fc6438509467ec0455a200cc49a43 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 6 Nov 2024 17:41:32 +0000 Subject: [PATCH 165/168] arm64/ptrace: Clarify documentation of VL configuration via ptrace When we configure SVE, SSVE or ZA via ptrace we allow the user to configure the vector length and specify any of the flags that are accepted when configuring via prctl(). This includes the S[VM]E_SET_VL_ONEXEC flag which defers the configuration of the VL until an exec(). We don't do anything to limit the provision of register data as part of configuring the _ONEXEC VL but as a function of the VL enumeration support we do this will be interpreted using the vector length currently configured for the process. This is all a bit surprising, and probably we should just not have allowed register data to be specified with _ONEXEC, but it's our ABI so let's add some explicit documentation in both the ABI documents and the source calling out what happens. The comments are also missing the fact that since SME does not have a mandatory 128 bit VL it is possible for VL enumeration to result in the configuration of a higher VL than was requested, cover that too. 
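As a rough sketch of the interface being documented (my own illustration, not kernel or selftest code; the exact header set may need adjusting and error handling is omitted), one way a tracer might request a VL change deferred to exec() is to write only the regset header, since any register payload appended here would be decoded using the current VL rather than the one being requested:

  #include <elf.h>
  #include <string.h>
  #include <sys/ptrace.h>
  #include <sys/types.h>
  #include <sys/uio.h>
  #include <asm/ptrace.h>

  /* Ask for a new SVE VL that only takes effect at the tracee's next exec(). */
  static long set_sve_vl_onexec(pid_t child, unsigned int vl)
  {
          struct user_sve_header header;
          struct iovec iov;

          memset(&header, 0, sizeof(header));
          header.size = sizeof(header);
          header.vl = vl;
          header.flags = SVE_PT_VL_ONEXEC | SVE_PT_REGS_FPSIMD;

          iov.iov_base = &header;
          iov.iov_len = sizeof(header);

          return ptrace(PTRACE_SETREGSET, child, NT_ARM_SVE, &iov);
  }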
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241106-arm64-sve-ptrace-vl-set-v1-1-3b164e8b559c@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arch/arm64/sme.rst | 4 ++++ Documentation/arch/arm64/sve.rst | 4 ++++ arch/arm64/kernel/ptrace.c | 12 ++++++++++-- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/Documentation/arch/arm64/sme.rst b/Documentation/arch/arm64/sme.rst index be317d457417..b2fa01f85cb5 100644 --- a/Documentation/arch/arm64/sme.rst +++ b/Documentation/arch/arm64/sme.rst @@ -346,6 +346,10 @@ The regset data starts with struct user_za_header, containing: * Writes to NT_ARM_ZT will set PSTATE.ZA to 1. +* If any register data is provided along with SME_PT_VL_ONEXEC then the + registers data will be interpreted with the current vector length, not + the vector length configured for use on exec. + 8. ELF coredump extensions --------------------------- diff --git a/Documentation/arch/arm64/sve.rst b/Documentation/arch/arm64/sve.rst index 8d8837fc39ec..28152492c29c 100644 --- a/Documentation/arch/arm64/sve.rst +++ b/Documentation/arch/arm64/sve.rst @@ -402,6 +402,10 @@ The regset data starts with struct user_sve_header, containing: streaming mode and any SETREGSET of NT_ARM_SSVE will enter streaming mode if the target was not in streaming mode. +* If any register data is provided along with SVE_PT_VL_ONEXEC then the + registers data will be interpreted with the current vector length, not + the vector length configured for use on exec. + * The effect of writing a partial, incomplete payload is unspecified. diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index b756578aeaee..f09ffd70c916 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -898,7 +898,11 @@ static int sve_set_common(struct task_struct *target, if (ret) goto out; - /* Actual VL set may be less than the user asked for: */ + /* + * Actual VL set may be different from what the user asked + * for, or we may have configured the _ONEXEC VL not the + * current VL: + */ vq = sve_vq_from_vl(task_get_vl(target, type)); /* Enter/exit streaming mode */ @@ -1125,7 +1129,11 @@ static int za_set(struct task_struct *target, if (ret) goto out; - /* Actual VL set may be less than the user asked for: */ + /* + * Actual VL set may be different from what the user asked + * for, or we may have configured the _ONEXEC rather than + * current VL: + */ vq = sve_vq_from_vl(task_get_sme_vl(target)); /* Ensure there is some SVE storage for streaming mode */ From 27141b690547da5650a420f26ec369ba142a9ebb Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 11 Nov 2024 16:18:55 +0000 Subject: [PATCH 166/168] kselftest/arm64: Don't leak pipe fds in pac.exec_sign_all() The PAC exec_sign_all() test spawns some child processes, creating pipes to be stdin and stdout for the child. It cleans up most of the file descriptors that are created as part of this but neglects to clean up the parent end of the child stdin and stdout. Add the missing close() calls. 
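For reference, the general shape of the pattern (a generic sketch, not the selftest's exec_sign_all() itself): every pipe end has to be closed in the process that no longer needs it, including the parent's own ends once communication is finished.

  #include <sys/types.h>
  #include <sys/wait.h>
  #include <unistd.h>

  static int run_child_with_pipes(void)
  {
          int new_stdin[2], new_stdout[2];
          pid_t pid;

          if (pipe(new_stdin))
                  return -1;
          if (pipe(new_stdout)) {
                  close(new_stdin[0]);
                  close(new_stdin[1]);
                  return -1;
          }

          pid = fork();
          if (pid < 0) {
                  close(new_stdin[0]);
                  close(new_stdin[1]);
                  close(new_stdout[0]);
                  close(new_stdout[1]);
                  return -1;
          }

          if (pid == 0) {
                  /* Child: wire the pipes up as stdin/stdout, drop the rest. */
                  dup2(new_stdin[0], STDIN_FILENO);
                  dup2(new_stdout[1], STDOUT_FILENO);
                  close(new_stdin[0]);
                  close(new_stdin[1]);
                  close(new_stdout[0]);
                  close(new_stdout[1]);
                  execlp("cat", "cat", (char *)NULL);
                  _exit(127);
          }

          /* Parent: the child's ends are not needed here. */
          close(new_stdin[0]);
          close(new_stdout[1]);

          /* ... write to new_stdin[1], read from new_stdout[0] ... */

          /*
           * The parent's own ends must also be closed once communication is
           * done; forgetting these is the kind of leak fixed above.
           */
          close(new_stdin[1]);
          close(new_stdout[0]);

          return waitpid(pid, NULL, 0) < 0 ? -1 : 0;
  }

  int main(void)
  {
          return run_child_with_pipes() ? 1 : 0;
  }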
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241111-arm64-pac-test-collisions-v1-1-171875f37e44@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/pauth/pac.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/arm64/pauth/pac.c b/tools/testing/selftests/arm64/pauth/pac.c index b743daa772f5..5a07b3958fbf 100644 --- a/tools/testing/selftests/arm64/pauth/pac.c +++ b/tools/testing/selftests/arm64/pauth/pac.c @@ -182,6 +182,9 @@ int exec_sign_all(struct signatures *signed_vals, size_t val) return -1; } + close(new_stdin[1]); + close(new_stdout[0]); + return 0; } From 91a6533811bb81139c5a44d039b9b0a6af238bc8 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 11 Nov 2024 16:18:56 +0000 Subject: [PATCH 167/168] kselftest/arm64: Try harder to generate different keys during PAC tests We very intermittently see failures in the single_thread_different_keys PAC test. As noted in the comment in the test the PAC field can be quite narrow so there is a chance of collisions even with different keys with a chance of 5% for 7 bit keys, and the potential for narrower keys. The test tries to avoid this by running repeatedly, but only tries 10 times which even with a 5% chance of collisions isn't enough. Increase the number of times we attempt to look for collisions by a factor of 100, this also affects other tests which are following a similar pattern with running the test repeatedly and either don't care like with pac_instruction_not_nop or potentially have the same issue like exec_sign_all. The PAC tests are very fast, running in a second or two even in emulation, so the 100x increased cost is mildly irritating but not a huge issue. The bulk of the overhead is in the exec_sign_all test which does a fork() and exec() per iteration. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20241111-arm64-pac-test-collisions-v1-2-171875f37e44@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/pauth/pac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/pauth/pac.c b/tools/testing/selftests/arm64/pauth/pac.c index 5a07b3958fbf..6d21b2fc758d 100644 --- a/tools/testing/selftests/arm64/pauth/pac.c +++ b/tools/testing/selftests/arm64/pauth/pac.c @@ -13,7 +13,7 @@ #include "../../kselftest_harness.h" #include "helper.h" -#define PAC_COLLISION_ATTEMPTS 10 +#define PAC_COLLISION_ATTEMPTS 1000 /* * The kernel sets TBID by default. So bits 55 and above should remain * untouched no matter what. From 67ab51cbdfee02ef07fb9d7d14cc0bf6cb5a5e5c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 14 Nov 2024 09:53:32 +0000 Subject: [PATCH 168/168] arm64: tls: Fix context-switching of tpidrro_el0 when kpti is enabled Commit 18011eac28c7 ("arm64: tls: Avoid unconditional zeroing of tpidrro_el0 for native tasks") tried to optimise the context switching of tpidrro_el0 by eliding the clearing of the register when switching to a native task with kpti enabled, on the erroneous assumption that the kpti trampoline entry code would already have taken care of the write. Although the kpti trampoline does zero the register on entry from a native task, the check in tls_thread_switch() is on the *next* task and so we can end up leaving a stale, non-zero value in the register if the previous task was 32-bit. Drop the broken optimisation and zero tpidrro_el0 unconditionally when switching to a native 64-bit task. 
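One way to read the failing sequence (reconstructed from the description above rather than taken from the original report):

  1. A 32-bit (compat) task runs, so tpidrro_el0 holds that task's TLS value.
  2. The scheduler switches to a native 64-bit task. With kpti enabled the old
     !arm64_kernel_unmapped_at_el0() check makes tls_thread_switch() skip the
     write, on the assumption that the trampoline has already zeroed the
     register.
  3. But the trampoline only zeroes tpidrro_el0 on kernel entry from a native
     task, and this entry came from the compat task, so the stale TLS value is
     still there and becomes visible to the native task.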
Cc: Mark Rutland Cc: stable@vger.kernel.org Fixes: 18011eac28c7 ("arm64: tls: Avoid unconditional zeroing of tpidrro_el0 for native tasks") Signed-off-by: Will Deacon Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20241114095332.23391-1-will@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 0540653fbf38..3d78798c4a0a 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -439,7 +439,7 @@ static void tls_thread_switch(struct task_struct *next) if (is_compat_thread(task_thread_info(next))) write_sysreg(next->thread.uw.tp_value, tpidrro_el0); - else if (!arm64_kernel_unmapped_at_el0()) + else write_sysreg(0, tpidrro_el0); write_sysreg(*task_user_tls(next), tpidr_el0);
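As a postscript, a quick way to look at the register from userspace (my own sketch, arm64-only; whether a stale value is ever observed depends on scheduling): a native 64-bit task should always read zero here.

  #include <stdio.h>

  static inline unsigned long read_tpidrro_el0(void)
  {
          unsigned long val;

          /* TPIDRRO_EL0 is readable, though not writable, at EL0. */
          asm volatile("mrs %0, tpidrro_el0" : "=r" (val));
          return val;
  }

  int main(void)
  {
          printf("tpidrro_el0 = %#lx\n", read_tpidrro_el0());
          return 0;
  }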