From 9af3e08baa7c20ee69b7fc18e720e08a091493b9 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 11 Sep 2020 14:25:09 +0100
Subject: [PATCH 01/87] KVM: arm64: Remove kvm_mmu_free_memory_caches()

kvm_mmu_free_memory_caches() is only called by kvm_arch_vcpu_destroy(),
so inline the implementation and get rid of the extra function.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Quentin Perret <qperret@google.com>
Link: https://lore.kernel.org/r/20200911132529.19844-2-will@kernel.org
---
 arch/arm64/include/asm/kvm_mmu.h | 2 --
 arch/arm64/kvm/arm.c             | 2 +-
 arch/arm64/kvm/mmu.c             | 5 -----
 3 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 189839c3706a..0f078b1920ff 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -141,8 +141,6 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 
 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);
 
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
-
 phys_addr_t kvm_mmu_get_httbr(void);
 phys_addr_t kvm_get_idmap_vector(void);
 int kvm_mmu_init(void);
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 46dc3d75cf13..262a0afbcc27 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -283,7 +283,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm)))
 		static_branch_dec(&userspace_irqchip_in_use);
 
-	kvm_mmu_free_memory_caches(vcpu);
+	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
 	kvm_timer_vcpu_terminate(vcpu);
 	kvm_pmu_vcpu_destroy(vcpu);
 
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index ba00bcc0c884..935f8f689433 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -2324,11 +2324,6 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
 				 kvm_test_age_hva_handler, NULL);
 }
 
-void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
-{
-	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
-}
-
 phys_addr_t kvm_mmu_get_httbr(void)
 {
 	if (__kvm_cpu_uses_extended_idmap())

From b1e57de62cfb4d05f45ab848bb893fbcff9557d3 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 11 Sep 2020 14:25:10 +0100
Subject: [PATCH 02/87] KVM: arm64: Add stand-alone page-table walker
 infrastructure

The KVM page-table code is intricately tied into the kernel page-table
code and re-uses the pte/pmd/pud/p4d/pgd macros directly in an attempt
to reduce code duplication. Unfortunately, the reality is that there is
an awful lot of code required to make this work, and at the end of the
day you're limited to creating page-tables with the same configuration
as the host kernel. Furthermore, lifting the page-table code to run
directly at EL2 on a non-VHE system (as we plan to to do in future
patches) is practically impossible due to the number of dependencies it
has on the core kernel.

Introduce a framework for walking Armv8 page-tables configured
independently from the host kernel.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Quentin Perret <qperret@google.com>
Link: https://lore.kernel.org/r/20200911132529.19844-3-will@kernel.org
---
 arch/arm64/include/asm/kvm_pgtable.h | 104 ++++++++++
 arch/arm64/kvm/hyp/Makefile          |   2 +-
 arch/arm64/kvm/hyp/pgtable.c         | 285 +++++++++++++++++++++++++++
 3 files changed, 390 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/include/asm/kvm_pgtable.h
 create mode 100644 arch/arm64/kvm/hyp/pgtable.c

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
new file mode 100644
index 000000000000..9423e5d609f1
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google LLC
+ * Author: Will Deacon <will@kernel.org>
+ */
+
+#ifndef __ARM64_KVM_PGTABLE_H__
+#define __ARM64_KVM_PGTABLE_H__
+
+#include <linux/bits.h>
+#include <linux/kvm_host.h>
+#include <linux/types.h>
+
+typedef u64 kvm_pte_t;
+
+/**
+ * struct kvm_pgtable - KVM page-table.
+ * @ia_bits:		Maximum input address size, in bits.
+ * @start_level:	Level at which the page-table walk starts.
+ * @pgd:		Pointer to the first top-level entry of the page-table.
+ * @mmu:		Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
+ */
+struct kvm_pgtable {
+	u32					ia_bits;
+	u32					start_level;
+	kvm_pte_t				*pgd;
+
+	/* Stage-2 only */
+	struct kvm_s2_mmu			*mmu;
+};
+
+/**
+ * enum kvm_pgtable_prot - Page-table permissions and attributes.
+ * @KVM_PGTABLE_PROT_X:		Execute permission.
+ * @KVM_PGTABLE_PROT_W:		Write permission.
+ * @KVM_PGTABLE_PROT_R:		Read permission.
+ * @KVM_PGTABLE_PROT_DEVICE:	Device attributes.
+ */
+enum kvm_pgtable_prot {
+	KVM_PGTABLE_PROT_X			= BIT(0),
+	KVM_PGTABLE_PROT_W			= BIT(1),
+	KVM_PGTABLE_PROT_R			= BIT(2),
+
+	KVM_PGTABLE_PROT_DEVICE			= BIT(3),
+};
+
+/**
+ * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk.
+ * @KVM_PGTABLE_WALK_LEAF:		Visit leaf entries, including invalid
+ *					entries.
+ * @KVM_PGTABLE_WALK_TABLE_PRE:		Visit table entries before their
+ *					children.
+ * @KVM_PGTABLE_WALK_TABLE_POST:	Visit table entries after their
+ *					children.
+ */
+enum kvm_pgtable_walk_flags {
+	KVM_PGTABLE_WALK_LEAF			= BIT(0),
+	KVM_PGTABLE_WALK_TABLE_PRE		= BIT(1),
+	KVM_PGTABLE_WALK_TABLE_POST		= BIT(2),
+};
+
+typedef int (*kvm_pgtable_visitor_fn_t)(u64 addr, u64 end, u32 level,
+					kvm_pte_t *ptep,
+					enum kvm_pgtable_walk_flags flag,
+					void * const arg);
+
+/**
+ * struct kvm_pgtable_walker - Hook into a page-table walk.
+ * @cb:		Callback function to invoke during the walk.
+ * @arg:	Argument passed to the callback function.
+ * @flags:	Bitwise-OR of flags to identify the entry types on which to
+ *		invoke the callback function.
+ */
+struct kvm_pgtable_walker {
+	const kvm_pgtable_visitor_fn_t		cb;
+	void * const				arg;
+	const enum kvm_pgtable_walk_flags	flags;
+};
+
+/**
+ * kvm_pgtable_walk() - Walk a page-table.
+ * @pgt:	Page-table structure initialised by kvm_pgtable_*_init().
+ * @addr:	Input address for the start of the walk.
+ * @size:	Size of the range to walk.
+ * @walker:	Walker callback description.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * The walker will walk the page-table entries corresponding to the input
+ * address range specified, visiting entries according to the walker flags.
+ * Invalid entries are treated as leaf entries. Leaf entries are reloaded
+ * after invoking the walker callback, allowing the walker to descend into
+ * a newly installed table.
+ *
+ * Returning a negative error code from the walker callback function will
+ * terminate the walk immediately with the same error code.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
+		     struct kvm_pgtable_walker *walker);
+
+#endif	/* __ARM64_KVM_PGTABLE_H__ */
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index f54f0e89a71c..607b8a898826 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -10,5 +10,5 @@ subdir-ccflags-y := -I$(incdir)				\
 		    -DDISABLE_BRANCH_PROFILING		\
 		    $(DISABLE_STACKLEAK_PLUGIN)
 
-obj-$(CONFIG_KVM) += vhe/ nvhe/
+obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o
 obj-$(CONFIG_KVM_INDIRECT_VECTORS) += smccc_wa.o
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
new file mode 100644
index 000000000000..3fb9d1949a3f
--- /dev/null
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -0,0 +1,285 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
+ * No bombay mix was harmed in the writing of this file.
+ *
+ * Copyright (C) 2020 Google LLC
+ * Author: Will Deacon <will@kernel.org>
+ */
+
+#include <linux/bitfield.h>
+#include <asm/kvm_pgtable.h>
+
+#define KVM_PGTABLE_MAX_LEVELS		4U
+
+#define KVM_PTE_VALID			BIT(0)
+
+#define KVM_PTE_TYPE			BIT(1)
+#define KVM_PTE_TYPE_BLOCK		0
+#define KVM_PTE_TYPE_PAGE		1
+#define KVM_PTE_TYPE_TABLE		1
+
+#define KVM_PTE_ADDR_MASK		GENMASK(47, PAGE_SHIFT)
+#define KVM_PTE_ADDR_51_48		GENMASK(15, 12)
+
+#define KVM_PTE_LEAF_ATTR_LO		GENMASK(11, 2)
+
+#define KVM_PTE_LEAF_ATTR_HI		GENMASK(63, 51)
+
+struct kvm_pgtable_walk_data {
+	struct kvm_pgtable		*pgt;
+	struct kvm_pgtable_walker	*walker;
+
+	u64				addr;
+	u64				end;
+};
+
+static u64 kvm_granule_shift(u32 level)
+{
+	/* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */
+	return ARM64_HW_PGTABLE_LEVEL_SHIFT(level);
+}
+
+static u64 kvm_granule_size(u32 level)
+{
+	return BIT(kvm_granule_shift(level));
+}
+
+static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level)
+{
+	u64 granule = kvm_granule_size(level);
+
+	/*
+	 * Reject invalid block mappings and don't bother with 4TB mappings for
+	 * 52-bit PAs.
+	 */
+	if (level == 0 || (PAGE_SIZE != SZ_4K && level == 1))
+		return false;
+
+	if (granule > (end - addr))
+		return false;
+
+	return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule);
+}
+
+static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
+{
+	u64 shift = kvm_granule_shift(level);
+	u64 mask = BIT(PAGE_SHIFT - 3) - 1;
+
+	return (data->addr >> shift) & mask;
+}
+
+static u32 __kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
+{
+	u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */
+	u64 mask = BIT(pgt->ia_bits) - 1;
+
+	return (addr & mask) >> shift;
+}
+
+static u32 kvm_pgd_page_idx(struct kvm_pgtable_walk_data *data)
+{
+	return __kvm_pgd_page_idx(data->pgt, data->addr);
+}
+
+static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
+{
+	struct kvm_pgtable pgt = {
+		.ia_bits	= ia_bits,
+		.start_level	= start_level,
+	};
+
+	return __kvm_pgd_page_idx(&pgt, -1ULL) + 1;
+}
+
+static bool kvm_pte_valid(kvm_pte_t pte)
+{
+	return pte & KVM_PTE_VALID;
+}
+
+static bool kvm_pte_table(kvm_pte_t pte, u32 level)
+{
+	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
+		return false;
+
+	if (!kvm_pte_valid(pte))
+		return false;
+
+	return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
+}
+
+static u64 kvm_pte_to_phys(kvm_pte_t pte)
+{
+	u64 pa = pte & KVM_PTE_ADDR_MASK;
+
+	if (PAGE_SHIFT == 16)
+		pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48;
+
+	return pa;
+}
+
+static kvm_pte_t kvm_phys_to_pte(u64 pa)
+{
+	kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK;
+
+	if (PAGE_SHIFT == 16)
+		pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, pa >> 48);
+
+	return pte;
+}
+
+static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte)
+{
+	return __va(kvm_pte_to_phys(pte));
+}
+
+static void kvm_set_invalid_pte(kvm_pte_t *ptep)
+{
+	kvm_pte_t pte = *ptep;
+	WRITE_ONCE(*ptep, pte & ~KVM_PTE_VALID);
+}
+
+static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp)
+{
+	kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(__pa(childp));
+
+	pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
+	pte |= KVM_PTE_VALID;
+
+	WARN_ON(kvm_pte_valid(old));
+	smp_store_release(ptep, pte);
+}
+
+static bool kvm_set_valid_leaf_pte(kvm_pte_t *ptep, u64 pa, kvm_pte_t attr,
+				   u32 level)
+{
+	kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(pa);
+	u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
+							   KVM_PTE_TYPE_BLOCK;
+
+	pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
+	pte |= FIELD_PREP(KVM_PTE_TYPE, type);
+	pte |= KVM_PTE_VALID;
+
+	/* Tolerate KVM recreating the exact same mapping. */
+	if (kvm_pte_valid(old))
+		return old == pte;
+
+	smp_store_release(ptep, pte);
+	return true;
+}
+
+static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, u64 addr,
+				  u32 level, kvm_pte_t *ptep,
+				  enum kvm_pgtable_walk_flags flag)
+{
+	struct kvm_pgtable_walker *walker = data->walker;
+	return walker->cb(addr, data->end, level, ptep, flag, walker->arg);
+}
+
+static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
+			      kvm_pte_t *pgtable, u32 level);
+
+static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
+				      kvm_pte_t *ptep, u32 level)
+{
+	int ret = 0;
+	u64 addr = data->addr;
+	kvm_pte_t *childp, pte = *ptep;
+	bool table = kvm_pte_table(pte, level);
+	enum kvm_pgtable_walk_flags flags = data->walker->flags;
+
+	if (table && (flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
+		ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+					     KVM_PGTABLE_WALK_TABLE_PRE);
+	}
+
+	if (!table && (flags & KVM_PGTABLE_WALK_LEAF)) {
+		ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+					     KVM_PGTABLE_WALK_LEAF);
+		pte = *ptep;
+		table = kvm_pte_table(pte, level);
+	}
+
+	if (ret)
+		goto out;
+
+	if (!table) {
+		data->addr += kvm_granule_size(level);
+		goto out;
+	}
+
+	childp = kvm_pte_follow(pte);
+	ret = __kvm_pgtable_walk(data, childp, level + 1);
+	if (ret)
+		goto out;
+
+	if (flags & KVM_PGTABLE_WALK_TABLE_POST) {
+		ret = kvm_pgtable_visitor_cb(data, addr, level, ptep,
+					     KVM_PGTABLE_WALK_TABLE_POST);
+	}
+
+out:
+	return ret;
+}
+
+static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
+			      kvm_pte_t *pgtable, u32 level)
+{
+	u32 idx;
+	int ret = 0;
+
+	if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS))
+		return -EINVAL;
+
+	for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
+		kvm_pte_t *ptep = &pgtable[idx];
+
+		if (data->addr >= data->end)
+			break;
+
+		ret = __kvm_pgtable_visit(data, ptep, level);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+static int _kvm_pgtable_walk(struct kvm_pgtable_walk_data *data)
+{
+	u32 idx;
+	int ret = 0;
+	struct kvm_pgtable *pgt = data->pgt;
+	u64 limit = BIT(pgt->ia_bits);
+
+	if (data->addr > limit || data->end > limit)
+		return -ERANGE;
+
+	if (!pgt->pgd)
+		return -EINVAL;
+
+	for (idx = kvm_pgd_page_idx(data); data->addr < data->end; ++idx) {
+		kvm_pte_t *ptep = &pgt->pgd[idx * PTRS_PER_PTE];
+
+		ret = __kvm_pgtable_walk(data, ptep, pgt->start_level);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
+		     struct kvm_pgtable_walker *walker)
+{
+	struct kvm_pgtable_walk_data walk_data = {
+		.pgt	= pgt,
+		.addr	= ALIGN_DOWN(addr, PAGE_SIZE),
+		.end	= PAGE_ALIGN(walk_data.addr + size),
+		.walker	= walker,
+	};
+
+	return _kvm_pgtable_walk(&walk_data);
+}

From bb0e92cbbcd1f91411d0a7241f8febe2c8c5b078 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 11 Sep 2020 14:25:11 +0100
Subject: [PATCH 03/87] KVM: arm64: Add support for creating kernel-agnostic
 stage-1 page tables

The generic page-table walker is pretty useless as it stands, because it
doesn't understand enough to allocate anything. Teach it about stage-1
page-tables, and hook up an API for allocating these for the hypervisor
at EL2.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Quentin Perret <qperret@google.com>
Link: https://lore.kernel.org/r/20200911132529.19844-4-will@kernel.org
---
 arch/arm64/include/asm/kvm_pgtable.h |  40 ++++++++
 arch/arm64/kvm/hyp/pgtable.c         | 133 +++++++++++++++++++++++++++
 2 files changed, 173 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 9423e5d609f1..d0a207d095e6 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -77,6 +77,46 @@ struct kvm_pgtable_walker {
 	const enum kvm_pgtable_walk_flags	flags;
 };
 
+/**
+ * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table.
+ * @pgt:	Uninitialised page-table structure to initialise.
+ * @va_bits:	Maximum virtual address bits.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits);
+
+/**
+ * kvm_pgtable_hyp_destroy() - Destroy an unused hypervisor stage-1 page-table.
+ * @pgt:	Page-table structure initialised by kvm_pgtable_hyp_init().
+ *
+ * The page-table is assumed to be unreachable by any hardware walkers prior
+ * to freeing and therefore no TLB invalidation is performed.
+ */
+void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt);
+
+/**
+ * kvm_pgtable_hyp_map() - Install a mapping in a hypervisor stage-1 page-table.
+ * @pgt:	Page-table structure initialised by kvm_pgtable_hyp_init().
+ * @addr:	Virtual address at which to place the mapping.
+ * @size:	Size of the mapping.
+ * @phys:	Physical address of the memory to map.
+ * @prot:	Permissions and attributes for the mapping.
+ *
+ * The offset of @addr within a page is ignored, @size is rounded-up to
+ * the next page boundary and @phys is rounded-down to the previous page
+ * boundary.
+ *
+ * If device attributes are not explicitly requested in @prot, then the
+ * mapping will be normal, cacheable. Attempts to install a new mapping
+ * for a virtual address that is already mapped will be rejected with an
+ * error and a WARN().
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
+			enum kvm_pgtable_prot prot);
+
 /**
  * kvm_pgtable_walk() - Walk a page-table.
  * @pgt:	Page-table structure initialised by kvm_pgtable_*_init().
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 3fb9d1949a3f..23a1006aa4ef 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -24,8 +24,18 @@
 
 #define KVM_PTE_LEAF_ATTR_LO		GENMASK(11, 2)
 
+#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX	GENMASK(4, 2)
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP	GENMASK(7, 6)
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO	3
+#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW	1
+#define KVM_PTE_LEAF_ATTR_LO_S1_SH	GENMASK(9, 8)
+#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS	3
+#define KVM_PTE_LEAF_ATTR_LO_S1_AF	BIT(10)
+
 #define KVM_PTE_LEAF_ATTR_HI		GENMASK(63, 51)
 
+#define KVM_PTE_LEAF_ATTR_HI_S1_XN	BIT(54)
+
 struct kvm_pgtable_walk_data {
 	struct kvm_pgtable		*pgt;
 	struct kvm_pgtable_walker	*walker;
@@ -283,3 +293,126 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
 
 	return _kvm_pgtable_walk(&walk_data);
 }
+
+struct hyp_map_data {
+	u64		phys;
+	kvm_pte_t	attr;
+};
+
+static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot,
+				 struct hyp_map_data *data)
+{
+	bool device = prot & KVM_PGTABLE_PROT_DEVICE;
+	u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
+	kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype);
+	u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS;
+	u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW :
+					       KVM_PTE_LEAF_ATTR_LO_S1_AP_RO;
+
+	if (!(prot & KVM_PGTABLE_PROT_R))
+		return -EINVAL;
+
+	if (prot & KVM_PGTABLE_PROT_X) {
+		if (prot & KVM_PGTABLE_PROT_W)
+			return -EINVAL;
+
+		if (device)
+			return -EINVAL;
+	} else {
+		attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
+	}
+
+	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
+	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
+	attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
+	data->attr = attr;
+	return 0;
+}
+
+static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level,
+				    kvm_pte_t *ptep, struct hyp_map_data *data)
+{
+	u64 granule = kvm_granule_size(level), phys = data->phys;
+
+	if (!kvm_block_mapping_supported(addr, end, phys, level))
+		return false;
+
+	WARN_ON(!kvm_set_valid_leaf_pte(ptep, phys, data->attr, level));
+	data->phys += granule;
+	return true;
+}
+
+static int hyp_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+			  enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+	kvm_pte_t *childp;
+
+	if (hyp_map_walker_try_leaf(addr, end, level, ptep, arg))
+		return 0;
+
+	if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
+		return -EINVAL;
+
+	childp = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
+	if (!childp)
+		return -ENOMEM;
+
+	kvm_set_table_pte(ptep, childp);
+	return 0;
+}
+
+int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
+			enum kvm_pgtable_prot prot)
+{
+	int ret;
+	struct hyp_map_data map_data = {
+		.phys	= ALIGN_DOWN(phys, PAGE_SIZE),
+	};
+	struct kvm_pgtable_walker walker = {
+		.cb	= hyp_map_walker,
+		.flags	= KVM_PGTABLE_WALK_LEAF,
+		.arg	= &map_data,
+	};
+
+	ret = hyp_map_set_prot_attr(prot, &map_data);
+	if (ret)
+		return ret;
+
+	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+	dsb(ishst);
+	isb();
+	return ret;
+}
+
+int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits)
+{
+	u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);
+
+	pgt->pgd = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL);
+	if (!pgt->pgd)
+		return -ENOMEM;
+
+	pgt->ia_bits		= va_bits;
+	pgt->start_level	= KVM_PGTABLE_MAX_LEVELS - levels;
+	pgt->mmu		= NULL;
+	return 0;
+}
+
+static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+			   enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+	free_page((unsigned long)kvm_pte_follow(*ptep));
+	return 0;
+}
+
+void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
+{
+	struct kvm_pgtable_walker walker = {
+		.cb	= hyp_free_walker,
+		.flags	= KVM_PGTABLE_WALK_TABLE_POST,
+	};
+
+	WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
+	free_page((unsigned long)pgt->pgd);
+	pgt->pgd = NULL;
+}

From 0f9d09b8e29bc8166f6584279aedc4a7a4038f68 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 11 Sep 2020 14:25:12 +0100
Subject: [PATCH 04/87] KVM: arm64: Use generic allocator for hyp stage-1
 page-tables

Now that we have a shiny new page-table allocator, replace the hyp
page-table code with calls into the new API. This also allows us to
remove the extended idmap code, as we can now simply ensure that the
VA size is large enough to map everything we need.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Quentin Perret <qperret@google.com>
Link: https://lore.kernel.org/r/20200911132529.19844-5-will@kernel.org
---
 arch/arm64/include/asm/kvm_mmu.h       |  78 +----
 arch/arm64/include/asm/kvm_pgtable.h   |   5 +
 arch/arm64/include/asm/pgtable-hwdef.h |   6 -
 arch/arm64/include/asm/pgtable-prot.h  |   6 -
 arch/arm64/kvm/mmu.c                   | 414 +++----------------------
 5 files changed, 45 insertions(+), 464 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 0f078b1920ff..42fb50cfe0d8 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -43,16 +43,6 @@
  *	HYP_VA_MIN = 1 << (VA_BITS - 1)
  * HYP_VA_MAX = HYP_VA_MIN + (1 << (VA_BITS - 1)) - 1
  *
- * This of course assumes that the trampoline page exists within the
- * VA_BITS range. If it doesn't, then it means we're in the odd case
- * where the kernel idmap (as well as HYP) uses more levels than the
- * kernel runtime page tables (as seen when the kernel is configured
- * for 4k pages, 39bits VA, and yet memory lives just above that
- * limit, forcing the idmap to use 4 levels of page tables while the
- * kernel itself only uses 3). In this particular case, it doesn't
- * matter which side of VA_BITS we use, as we're guaranteed not to
- * conflict with anything.
- *
  * When using VHE, there are no separate hyp mappings and all KVM
  * functionality is already mapped as part of the main kernel
  * mappings, and none of this applies in that case.
@@ -123,9 +113,10 @@ static inline bool kvm_page_empty(void *ptr)
 	return page_count(ptr_page) == 1;
 }
 
+#include <asm/kvm_pgtable.h>
 #include <asm/stage2_pgtable.h>
 
-int create_hyp_mappings(void *from, void *to, pgprot_t prot);
+int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
 int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
 			   void __iomem **kaddr,
 			   void __iomem **haddr);
@@ -144,8 +135,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);
 phys_addr_t kvm_mmu_get_httbr(void);
 phys_addr_t kvm_get_idmap_vector(void);
 int kvm_mmu_init(void);
-void kvm_clear_hyp_idmap(void);
-
 #define kvm_mk_pmd(ptep)					\
 	__pmd(__phys_to_pmd_val(__pa(ptep)) | PMD_TYPE_TABLE)
 #define kvm_mk_pud(pmdp)					\
@@ -263,25 +252,6 @@ static inline bool kvm_s2pud_young(pud_t pud)
 	return pud_young(pud);
 }
 
-#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)
-
-#ifdef __PAGETABLE_PMD_FOLDED
-#define hyp_pmd_table_empty(pmdp) (0)
-#else
-#define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
-#endif
-
-#ifdef __PAGETABLE_PUD_FOLDED
-#define hyp_pud_table_empty(pudp) (0)
-#else
-#define hyp_pud_table_empty(pudp) kvm_page_empty(pudp)
-#endif
-
-#ifdef __PAGETABLE_P4D_FOLDED
-#define hyp_p4d_table_empty(p4dp) (0)
-#else
-#define hyp_p4d_table_empty(p4dp) kvm_page_empty(p4dp)
-#endif
 
 struct kvm;
 
@@ -350,50 +320,6 @@ static inline void __kvm_flush_dcache_pud(pud_t pud)
 void kvm_set_way_flush(struct kvm_vcpu *vcpu);
 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
 
-static inline bool __kvm_cpu_uses_extended_idmap(void)
-{
-	return __cpu_uses_extended_idmap_level();
-}
-
-static inline unsigned long __kvm_idmap_ptrs_per_pgd(void)
-{
-	return idmap_ptrs_per_pgd;
-}
-
-/*
- * Can't use pgd_populate here, because the extended idmap adds an extra level
- * above CONFIG_PGTABLE_LEVELS (which is 2 or 3 if we're using the extended
- * idmap), and pgd_populate is only available if CONFIG_PGTABLE_LEVELS = 4.
- */
-static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd,
-				       pgd_t *hyp_pgd,
-				       pgd_t *merged_hyp_pgd,
-				       unsigned long hyp_idmap_start)
-{
-	int idmap_idx;
-	u64 pgd_addr;
-
-	/*
-	 * Use the first entry to access the HYP mappings. It is
-	 * guaranteed to be free, otherwise we wouldn't use an
-	 * extended idmap.
-	 */
-	VM_BUG_ON(pgd_val(merged_hyp_pgd[0]));
-	pgd_addr = __phys_to_pgd_val(__pa(hyp_pgd));
-	merged_hyp_pgd[0] = __pgd(pgd_addr | PMD_TYPE_TABLE);
-
-	/*
-	 * Create another extended level entry that points to the boot HYP map,
-	 * which contains an ID mapping of the HYP init code. We essentially
-	 * merge the boot and runtime HYP maps by doing so, but they don't
-	 * overlap anyway, so this is fine.
-	 */
-	idmap_idx = hyp_idmap_start >> VA_BITS;
-	VM_BUG_ON(pgd_val(merged_hyp_pgd[idmap_idx]));
-	pgd_addr = __phys_to_pgd_val(__pa(boot_hyp_pgd));
-	merged_hyp_pgd[idmap_idx] = __pgd(pgd_addr | PMD_TYPE_TABLE);
-}
-
 static inline unsigned int kvm_get_vmid_bits(void)
 {
 	int reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index d0a207d095e6..ff5d7d27eb39 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -44,6 +44,11 @@ enum kvm_pgtable_prot {
 	KVM_PGTABLE_PROT_DEVICE			= BIT(3),
 };
 
+#define PAGE_HYP		(KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
+#define PAGE_HYP_EXEC		(KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X)
+#define PAGE_HYP_RO		(KVM_PGTABLE_PROT_R)
+#define PAGE_HYP_DEVICE		(PAGE_HYP | KVM_PGTABLE_PROT_DEVICE)
+
 /**
  * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk.
  * @KVM_PGTABLE_WALK_LEAF:		Visit leaf entries, including invalid
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index d400a4d9aee2..1a989353144e 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -194,12 +194,6 @@
  */
 #define PTE_S2_MEMATTR(t)	(_AT(pteval_t, (t)) << 2)
 
-/*
- * EL2/HYP PTE/PMD definitions
- */
-#define PMD_HYP			PMD_SECT_USER
-#define PTE_HYP			PTE_USER
-
 /*
  * Highest possible physical address supported.
  */
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 4d867c6446c4..88acd7e1cd05 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -56,7 +56,6 @@ extern bool arm64_use_ng_mappings;
 #define PROT_SECT_NORMAL_EXEC	(PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL))
 
 #define _PAGE_DEFAULT		(_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL))
-#define _HYP_PAGE_DEFAULT	_PAGE_DEFAULT
 
 #define PAGE_KERNEL		__pgprot(PROT_NORMAL)
 #define PAGE_KERNEL_RO		__pgprot((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY)
@@ -64,11 +63,6 @@ extern bool arm64_use_ng_mappings;
 #define PAGE_KERNEL_EXEC	__pgprot(PROT_NORMAL & ~PTE_PXN)
 #define PAGE_KERNEL_EXEC_CONT	__pgprot((PROT_NORMAL & ~PTE_PXN) | PTE_CONT)
 
-#define PAGE_HYP		__pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN)
-#define PAGE_HYP_EXEC		__pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY)
-#define PAGE_HYP_RO		__pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN)
-#define PAGE_HYP_DEVICE		__pgprot(_PROT_DEFAULT | PTE_ATTRINDX(MT_DEVICE_nGnRE) | PTE_HYP | PTE_HYP_XN)
-
 #define PAGE_S2_MEMATTR(attr)						\
 	({								\
 		u64 __val;						\
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 935f8f689433..fabd72b0c8a4 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -14,6 +14,7 @@
 #include <asm/cacheflush.h>
 #include <asm/kvm_arm.h>
 #include <asm/kvm_mmu.h>
+#include <asm/kvm_pgtable.h>
 #include <asm/kvm_ras.h>
 #include <asm/kvm_asm.h>
 #include <asm/kvm_emulate.h>
@@ -21,9 +22,7 @@
 
 #include "trace.h"
 
-static pgd_t *boot_hyp_pgd;
-static pgd_t *hyp_pgd;
-static pgd_t *merged_hyp_pgd;
+static struct kvm_pgtable *hyp_pgtable;
 static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
 
 static unsigned long hyp_idmap_start;
@@ -32,8 +31,6 @@ static phys_addr_t hyp_idmap_vector;
 
 static unsigned long io_map_base;
 
-#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
-
 #define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
 #define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)
 
@@ -489,338 +486,28 @@ static void stage2_flush_vm(struct kvm *kvm)
 	srcu_read_unlock(&kvm->srcu, idx);
 }
 
-static void clear_hyp_pgd_entry(pgd_t *pgd)
-{
-	p4d_t *p4d_table __maybe_unused = p4d_offset(pgd, 0UL);
-	pgd_clear(pgd);
-	p4d_free(NULL, p4d_table);
-	put_page(virt_to_page(pgd));
-}
-
-static void clear_hyp_p4d_entry(p4d_t *p4d)
-{
-	pud_t *pud_table __maybe_unused = pud_offset(p4d, 0UL);
-	VM_BUG_ON(p4d_huge(*p4d));
-	p4d_clear(p4d);
-	pud_free(NULL, pud_table);
-	put_page(virt_to_page(p4d));
-}
-
-static void clear_hyp_pud_entry(pud_t *pud)
-{
-	pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
-	VM_BUG_ON(pud_huge(*pud));
-	pud_clear(pud);
-	pmd_free(NULL, pmd_table);
-	put_page(virt_to_page(pud));
-}
-
-static void clear_hyp_pmd_entry(pmd_t *pmd)
-{
-	pte_t *pte_table = pte_offset_kernel(pmd, 0);
-	VM_BUG_ON(pmd_thp_or_huge(*pmd));
-	pmd_clear(pmd);
-	pte_free_kernel(NULL, pte_table);
-	put_page(virt_to_page(pmd));
-}
-
-static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
-{
-	pte_t *pte, *start_pte;
-
-	start_pte = pte = pte_offset_kernel(pmd, addr);
-	do {
-		if (!pte_none(*pte)) {
-			kvm_set_pte(pte, __pte(0));
-			put_page(virt_to_page(pte));
-		}
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-
-	if (hyp_pte_table_empty(start_pte))
-		clear_hyp_pmd_entry(pmd);
-}
-
-static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
-{
-	phys_addr_t next;
-	pmd_t *pmd, *start_pmd;
-
-	start_pmd = pmd = pmd_offset(pud, addr);
-	do {
-		next = pmd_addr_end(addr, end);
-		/* Hyp doesn't use huge pmds */
-		if (!pmd_none(*pmd))
-			unmap_hyp_ptes(pmd, addr, next);
-	} while (pmd++, addr = next, addr != end);
-
-	if (hyp_pmd_table_empty(start_pmd))
-		clear_hyp_pud_entry(pud);
-}
-
-static void unmap_hyp_puds(p4d_t *p4d, phys_addr_t addr, phys_addr_t end)
-{
-	phys_addr_t next;
-	pud_t *pud, *start_pud;
-
-	start_pud = pud = pud_offset(p4d, addr);
-	do {
-		next = pud_addr_end(addr, end);
-		/* Hyp doesn't use huge puds */
-		if (!pud_none(*pud))
-			unmap_hyp_pmds(pud, addr, next);
-	} while (pud++, addr = next, addr != end);
-
-	if (hyp_pud_table_empty(start_pud))
-		clear_hyp_p4d_entry(p4d);
-}
-
-static void unmap_hyp_p4ds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
-{
-	phys_addr_t next;
-	p4d_t *p4d, *start_p4d;
-
-	start_p4d = p4d = p4d_offset(pgd, addr);
-	do {
-		next = p4d_addr_end(addr, end);
-		/* Hyp doesn't use huge p4ds */
-		if (!p4d_none(*p4d))
-			unmap_hyp_puds(p4d, addr, next);
-	} while (p4d++, addr = next, addr != end);
-
-	if (hyp_p4d_table_empty(start_p4d))
-		clear_hyp_pgd_entry(pgd);
-}
-
-static unsigned int kvm_pgd_index(unsigned long addr, unsigned int ptrs_per_pgd)
-{
-	return (addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1);
-}
-
-static void __unmap_hyp_range(pgd_t *pgdp, unsigned long ptrs_per_pgd,
-			      phys_addr_t start, u64 size)
-{
-	pgd_t *pgd;
-	phys_addr_t addr = start, end = start + size;
-	phys_addr_t next;
-
-	/*
-	 * We don't unmap anything from HYP, except at the hyp tear down.
-	 * Hence, we don't have to invalidate the TLBs here.
-	 */
-	pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
-	do {
-		next = pgd_addr_end(addr, end);
-		if (!pgd_none(*pgd))
-			unmap_hyp_p4ds(pgd, addr, next);
-	} while (pgd++, addr = next, addr != end);
-}
-
-static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
-{
-	__unmap_hyp_range(pgdp, PTRS_PER_PGD, start, size);
-}
-
-static void unmap_hyp_idmap_range(pgd_t *pgdp, phys_addr_t start, u64 size)
-{
-	__unmap_hyp_range(pgdp, __kvm_idmap_ptrs_per_pgd(), start, size);
-}
-
 /**
  * free_hyp_pgds - free Hyp-mode page tables
- *
- * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
- * therefore contains either mappings in the kernel memory area (above
- * PAGE_OFFSET), or device mappings in the idmap range.
- *
- * boot_hyp_pgd should only map the idmap range, and is only used in
- * the extended idmap case.
  */
 void free_hyp_pgds(void)
 {
-	pgd_t *id_pgd;
-
 	mutex_lock(&kvm_hyp_pgd_mutex);
-
-	id_pgd = boot_hyp_pgd ? boot_hyp_pgd : hyp_pgd;
-
-	if (id_pgd) {
-		/* In case we never called hyp_mmu_init() */
-		if (!io_map_base)
-			io_map_base = hyp_idmap_start;
-		unmap_hyp_idmap_range(id_pgd, io_map_base,
-				      hyp_idmap_start + PAGE_SIZE - io_map_base);
+	if (hyp_pgtable) {
+		kvm_pgtable_hyp_destroy(hyp_pgtable);
+		kfree(hyp_pgtable);
 	}
-
-	if (boot_hyp_pgd) {
-		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
-		boot_hyp_pgd = NULL;
-	}
-
-	if (hyp_pgd) {
-		unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET),
-				(uintptr_t)high_memory - PAGE_OFFSET);
-
-		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
-		hyp_pgd = NULL;
-	}
-	if (merged_hyp_pgd) {
-		clear_page(merged_hyp_pgd);
-		free_page((unsigned long)merged_hyp_pgd);
-		merged_hyp_pgd = NULL;
-	}
-
 	mutex_unlock(&kvm_hyp_pgd_mutex);
 }
 
-static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
-				    unsigned long end, unsigned long pfn,
-				    pgprot_t prot)
+static int __create_hyp_mappings(unsigned long start, unsigned long size,
+				 unsigned long phys, enum kvm_pgtable_prot prot)
 {
-	pte_t *pte;
-	unsigned long addr;
-
-	addr = start;
-	do {
-		pte = pte_offset_kernel(pmd, addr);
-		kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
-		get_page(virt_to_page(pte));
-		pfn++;
-	} while (addr += PAGE_SIZE, addr != end);
-}
-
-static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
-				   unsigned long end, unsigned long pfn,
-				   pgprot_t prot)
-{
-	pmd_t *pmd;
-	pte_t *pte;
-	unsigned long addr, next;
-
-	addr = start;
-	do {
-		pmd = pmd_offset(pud, addr);
-
-		BUG_ON(pmd_sect(*pmd));
-
-		if (pmd_none(*pmd)) {
-			pte = pte_alloc_one_kernel(NULL);
-			if (!pte) {
-				kvm_err("Cannot allocate Hyp pte\n");
-				return -ENOMEM;
-			}
-			kvm_pmd_populate(pmd, pte);
-			get_page(virt_to_page(pmd));
-		}
-
-		next = pmd_addr_end(addr, end);
-
-		create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
-		pfn += (next - addr) >> PAGE_SHIFT;
-	} while (addr = next, addr != end);
-
-	return 0;
-}
-
-static int create_hyp_pud_mappings(p4d_t *p4d, unsigned long start,
-				   unsigned long end, unsigned long pfn,
-				   pgprot_t prot)
-{
-	pud_t *pud;
-	pmd_t *pmd;
-	unsigned long addr, next;
-	int ret;
-
-	addr = start;
-	do {
-		pud = pud_offset(p4d, addr);
-
-		if (pud_none_or_clear_bad(pud)) {
-			pmd = pmd_alloc_one(NULL, addr);
-			if (!pmd) {
-				kvm_err("Cannot allocate Hyp pmd\n");
-				return -ENOMEM;
-			}
-			kvm_pud_populate(pud, pmd);
-			get_page(virt_to_page(pud));
-		}
-
-		next = pud_addr_end(addr, end);
-		ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
-		if (ret)
-			return ret;
-		pfn += (next - addr) >> PAGE_SHIFT;
-	} while (addr = next, addr != end);
-
-	return 0;
-}
-
-static int create_hyp_p4d_mappings(pgd_t *pgd, unsigned long start,
-				   unsigned long end, unsigned long pfn,
-				   pgprot_t prot)
-{
-	p4d_t *p4d;
-	pud_t *pud;
-	unsigned long addr, next;
-	int ret;
-
-	addr = start;
-	do {
-		p4d = p4d_offset(pgd, addr);
-
-		if (p4d_none(*p4d)) {
-			pud = pud_alloc_one(NULL, addr);
-			if (!pud) {
-				kvm_err("Cannot allocate Hyp pud\n");
-				return -ENOMEM;
-			}
-			kvm_p4d_populate(p4d, pud);
-			get_page(virt_to_page(p4d));
-		}
-
-		next = p4d_addr_end(addr, end);
-		ret = create_hyp_pud_mappings(p4d, addr, next, pfn, prot);
-		if (ret)
-			return ret;
-		pfn += (next - addr) >> PAGE_SHIFT;
-	} while (addr = next, addr != end);
-
-	return 0;
-}
-
-static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd,
-				 unsigned long start, unsigned long end,
-				 unsigned long pfn, pgprot_t prot)
-{
-	pgd_t *pgd;
-	p4d_t *p4d;
-	unsigned long addr, next;
-	int err = 0;
+	int err;
 
 	mutex_lock(&kvm_hyp_pgd_mutex);
-	addr = start & PAGE_MASK;
-	end = PAGE_ALIGN(end);
-	do {
-		pgd = pgdp + kvm_pgd_index(addr, ptrs_per_pgd);
-
-		if (pgd_none(*pgd)) {
-			p4d = p4d_alloc_one(NULL, addr);
-			if (!p4d) {
-				kvm_err("Cannot allocate Hyp p4d\n");
-				err = -ENOMEM;
-				goto out;
-			}
-			kvm_pgd_populate(pgd, p4d);
-			get_page(virt_to_page(pgd));
-		}
-
-		next = pgd_addr_end(addr, end);
-		err = create_hyp_p4d_mappings(pgd, addr, next, pfn, prot);
-		if (err)
-			goto out;
-		pfn += (next - addr) >> PAGE_SHIFT;
-	} while (addr = next, addr != end);
-out:
+	err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
 	mutex_unlock(&kvm_hyp_pgd_mutex);
+
 	return err;
 }
 
@@ -845,7 +532,7 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
  * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
  * physical pages.
  */
-int create_hyp_mappings(void *from, void *to, pgprot_t prot)
+int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
 {
 	phys_addr_t phys_addr;
 	unsigned long virt_addr;
@@ -862,9 +549,7 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot)
 		int err;
 
 		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
-		err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD,
-					    virt_addr, virt_addr + PAGE_SIZE,
-					    __phys_to_pfn(phys_addr),
+		err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
 					    prot);
 		if (err)
 			return err;
@@ -874,9 +559,9 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot)
 }
 
 static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
-					unsigned long *haddr, pgprot_t prot)
+					unsigned long *haddr,
+					enum kvm_pgtable_prot prot)
 {
-	pgd_t *pgd = hyp_pgd;
 	unsigned long base;
 	int ret = 0;
 
@@ -908,17 +593,11 @@ static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
 	if (ret)
 		goto out;
 
-	if (__kvm_cpu_uses_extended_idmap())
-		pgd = boot_hyp_pgd;
-
-	ret = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
-				    base, base + size,
-				    __phys_to_pfn(phys_addr), prot);
+	ret = __create_hyp_mappings(base, size, phys_addr, prot);
 	if (ret)
 		goto out;
 
 	*haddr = base + offset_in_page(phys_addr);
-
 out:
 	return ret;
 }
@@ -2326,10 +2005,7 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
 
 phys_addr_t kvm_mmu_get_httbr(void)
 {
-	if (__kvm_cpu_uses_extended_idmap())
-		return virt_to_phys(merged_hyp_pgd);
-	else
-		return virt_to_phys(hyp_pgd);
+	return __pa(hyp_pgtable->pgd);
 }
 
 phys_addr_t kvm_get_idmap_vector(void)
@@ -2337,15 +2013,11 @@ phys_addr_t kvm_get_idmap_vector(void)
 	return hyp_idmap_vector;
 }
 
-static int kvm_map_idmap_text(pgd_t *pgd)
+static int kvm_map_idmap_text(void)
 {
-	int err;
-
-	/* Create the idmap in the boot page tables */
-	err = 	__create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(),
-				      hyp_idmap_start, hyp_idmap_end,
-				      __phys_to_pfn(hyp_idmap_start),
-				      PAGE_HYP_EXEC);
+	unsigned long size = hyp_idmap_end - hyp_idmap_start;
+	int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
+					PAGE_HYP_EXEC);
 	if (err)
 		kvm_err("Failed to idmap %lx-%lx\n",
 			hyp_idmap_start, hyp_idmap_end);
@@ -2356,6 +2028,7 @@ static int kvm_map_idmap_text(pgd_t *pgd)
 int kvm_mmu_init(void)
 {
 	int err;
+	u32 hyp_va_bits;
 
 	hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
 	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
@@ -2369,6 +2042,8 @@ int kvm_mmu_init(void)
 	 */
 	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
 
+	hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET);
+	kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits);
 	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
 	kvm_debug("HYP VA range: %lx:%lx\n",
 		  kern_hyp_va(PAGE_OFFSET),
@@ -2386,43 +2061,30 @@ int kvm_mmu_init(void)
 		goto out;
 	}
 
-	hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
-	if (!hyp_pgd) {
-		kvm_err("Hyp mode PGD not allocated\n");
+	hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
+	if (!hyp_pgtable) {
+		kvm_err("Hyp mode page-table not allocated\n");
 		err = -ENOMEM;
 		goto out;
 	}
 
-	if (__kvm_cpu_uses_extended_idmap()) {
-		boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
-							 hyp_pgd_order);
-		if (!boot_hyp_pgd) {
-			kvm_err("Hyp boot PGD not allocated\n");
-			err = -ENOMEM;
-			goto out;
-		}
+	err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits);
+	if (err)
+		goto out_free_pgtable;
 
-		err = kvm_map_idmap_text(boot_hyp_pgd);
-		if (err)
-			goto out;
-
-		merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-		if (!merged_hyp_pgd) {
-			kvm_err("Failed to allocate extra HYP pgd\n");
-			goto out;
-		}
-		__kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
-				    hyp_idmap_start);
-	} else {
-		err = kvm_map_idmap_text(hyp_pgd);
-		if (err)
-			goto out;
-	}
+	err = kvm_map_idmap_text();
+	if (err)
+		goto out_destroy_pgtable;
 
 	io_map_base = hyp_idmap_start;
 	return 0;
+
+out_destroy_pgtable:
+	kvm_pgtable_hyp_destroy(hyp_pgtable);
+out_free_pgtable:
+	kfree(hyp_pgtable);
+	hyp_pgtable = NULL;
 out:
-	free_hyp_pgds();
 	return err;
 }
 

From 71233d05f4b5e3560f0d3d5607d01e9beff8dcbd Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 11 Sep 2020 14:25:13 +0100
Subject: [PATCH 05/87] KVM: arm64: Add support for creating kernel-agnostic
 stage-2 page tables

Introduce alloc() and free() functions to the generic page-table code
for guest stage-2 page-tables and plumb these into the existing KVM
page-table allocator. Subsequent patches will convert other operations
within the KVM allocator over to the generic code.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Quentin Perret <qperret@google.com>
Link: https://lore.kernel.org/r/20200911132529.19844-6-will@kernel.org
---
 arch/arm64/include/asm/kvm_host.h    |  1 +
 arch/arm64/include/asm/kvm_pgtable.h | 18 +++++++++
 arch/arm64/kvm/hyp/pgtable.c         | 54 +++++++++++++++++++++++++++
 arch/arm64/kvm/mmu.c                 | 55 +++++++++++++++-------------
 4 files changed, 102 insertions(+), 26 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index e52c927aade5..0b7c702b2151 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -81,6 +81,7 @@ struct kvm_s2_mmu {
 	 */
 	pgd_t		*pgd;
 	phys_addr_t	pgd_phys;
+	struct kvm_pgtable *pgt;
 
 	/* The last vcpu id that ran on each physical CPU */
 	int __percpu *last_vcpu_ran;
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index ff5d7d27eb39..21d71395a377 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -122,6 +122,24 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt);
 int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
 			enum kvm_pgtable_prot prot);
 
+/**
+ * kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table.
+ * @pgt:	Uninitialised page-table structure to initialise.
+ * @kvm:	KVM structure representing the guest virtual machine.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm);
+
+/**
+ * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table.
+ * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
+ *
+ * The page-table is assumed to be unreachable by any hardware walkers prior
+ * to freeing and therefore no TLB invalidation is performed.
+ */
+void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
+
 /**
  * kvm_pgtable_walk() - Walk a page-table.
  * @pgt:	Page-table structure initialised by kvm_pgtable_*_init().
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 23a1006aa4ef..16b34d11e7cf 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -416,3 +416,57 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
 	free_page((unsigned long)pgt->pgd);
 	pgt->pgd = NULL;
 }
+
+int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm)
+{
+	size_t pgd_sz;
+	u64 vtcr = kvm->arch.vtcr;
+	u32 ia_bits = VTCR_EL2_IPA(vtcr);
+	u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
+	u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
+
+	pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
+	pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL | __GFP_ZERO);
+	if (!pgt->pgd)
+		return -ENOMEM;
+
+	pgt->ia_bits		= ia_bits;
+	pgt->start_level	= start_level;
+	pgt->mmu		= &kvm->arch.mmu;
+
+	/* Ensure zeroed PGD pages are visible to the hardware walker */
+	dsb(ishst);
+	return 0;
+}
+
+static int stage2_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+			      enum kvm_pgtable_walk_flags flag,
+			      void * const arg)
+{
+	kvm_pte_t pte = *ptep;
+
+	if (!kvm_pte_valid(pte))
+		return 0;
+
+	put_page(virt_to_page(ptep));
+
+	if (kvm_pte_table(pte, level))
+		free_page((unsigned long)kvm_pte_follow(pte));
+
+	return 0;
+}
+
+void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+{
+	size_t pgd_sz;
+	struct kvm_pgtable_walker walker = {
+		.cb	= stage2_free_walker,
+		.flags	= KVM_PGTABLE_WALK_LEAF |
+			  KVM_PGTABLE_WALK_TABLE_POST,
+	};
+
+	WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
+	pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
+	free_pages_exact(pgt->pgd, pgd_sz);
+	pgt->pgd = NULL;
+}
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index fabd72b0c8a4..4607e9ca60a2 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -668,47 +668,49 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
  * @kvm:	The pointer to the KVM structure
  * @mmu:	The pointer to the s2 MMU structure
  *
- * Allocates only the stage-2 HW PGD level table(s) of size defined by
- * stage2_pgd_size(mmu->kvm).
- *
+ * Allocates only the stage-2 HW PGD level table(s).
  * Note we don't need locking here as this is only called when the VM is
  * created, which can only be done once.
  */
 int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
 {
-	phys_addr_t pgd_phys;
-	pgd_t *pgd;
-	int cpu;
+	int cpu, err;
+	struct kvm_pgtable *pgt;
 
-	if (mmu->pgd != NULL) {
+	if (mmu->pgt != NULL) {
 		kvm_err("kvm_arch already initialized?\n");
 		return -EINVAL;
 	}
 
-	/* Allocate the HW PGD, making sure that each page gets its own refcount */
-	pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO);
-	if (!pgd)
+	pgt = kzalloc(sizeof(*pgt), GFP_KERNEL);
+	if (!pgt)
 		return -ENOMEM;
 
-	pgd_phys = virt_to_phys(pgd);
-	if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
-		return -EINVAL;
+	err = kvm_pgtable_stage2_init(pgt, kvm);
+	if (err)
+		goto out_free_pgtable;
 
 	mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
 	if (!mmu->last_vcpu_ran) {
-		free_pages_exact(pgd, stage2_pgd_size(kvm));
-		return -ENOMEM;
+		err = -ENOMEM;
+		goto out_destroy_pgtable;
 	}
 
 	for_each_possible_cpu(cpu)
 		*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;
 
 	mmu->kvm = kvm;
-	mmu->pgd = pgd;
-	mmu->pgd_phys = pgd_phys;
+	mmu->pgt = pgt;
+	mmu->pgd_phys = __pa(pgt->pgd);
+	mmu->pgd = (void *)pgt->pgd;
 	mmu->vmid.vmid_gen = 0;
-
 	return 0;
+
+out_destroy_pgtable:
+	kvm_pgtable_stage2_destroy(pgt);
+out_free_pgtable:
+	kfree(pgt);
+	return err;
 }
 
 static void stage2_unmap_memslot(struct kvm *kvm,
@@ -781,20 +783,21 @@ void stage2_unmap_vm(struct kvm *kvm)
 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
 {
 	struct kvm *kvm = mmu->kvm;
-	void *pgd = NULL;
+	struct kvm_pgtable *pgt = NULL;
 
 	spin_lock(&kvm->mmu_lock);
-	if (mmu->pgd) {
-		unmap_stage2_range(mmu, 0, kvm_phys_size(kvm));
-		pgd = READ_ONCE(mmu->pgd);
+	pgt = mmu->pgt;
+	if (pgt) {
 		mmu->pgd = NULL;
+		mmu->pgd_phys = 0;
+		mmu->pgt = NULL;
+		free_percpu(mmu->last_vcpu_ran);
 	}
 	spin_unlock(&kvm->mmu_lock);
 
-	/* Free the HW pgd, one page at a time */
-	if (pgd) {
-		free_pages_exact(pgd, stage2_pgd_size(kvm));
-		free_percpu(mmu->last_vcpu_ran);
+	if (pgt) {
+		kvm_pgtable_stage2_destroy(pgt);
+		kfree(pgt);
 	}
 }
 

From 6d9d2115c48027132a69477c98d812be669f5b9b Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 11 Sep 2020 14:25:14 +0100
Subject: [PATCH 06/87] KVM: arm64: Add support for stage-2 map()/unmap() in
 generic page-table

Add stage-2 map() and unmap() operations to the generic page-table code.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Quentin Perret <qperret@google.com>
Link: https://lore.kernel.org/r/20200911132529.19844-7-will@kernel.org
---
 arch/arm64/include/asm/kvm_pgtable.h |  46 +++++
 arch/arm64/kvm/hyp/pgtable.c         | 273 +++++++++++++++++++++++++++
 2 files changed, 319 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 21d71395a377..895b2238062b 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -140,6 +140,52 @@ int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm);
  */
 void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
 
+/**
+ * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table.
+ * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:	Intermediate physical address at which to place the mapping.
+ * @size:	Size of the mapping.
+ * @phys:	Physical address of the memory to map.
+ * @prot:	Permissions and attributes for the mapping.
+ * @mc:		Cache of pre-allocated GFP_PGTABLE_USER memory from which to
+ *		allocate page-table pages.
+ *
+ * The offset of @addr within a page is ignored, @size is rounded-up to
+ * the next page boundary and @phys is rounded-down to the previous page
+ * boundary.
+ *
+ * If device attributes are not explicitly requested in @prot, then the
+ * mapping will be normal, cacheable.
+ *
+ * Note that this function will both coalesce existing table entries and split
+ * existing block mappings, relying on page-faults to fault back areas outside
+ * of the new mapping lazily.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
+			   u64 phys, enum kvm_pgtable_prot prot,
+			   struct kvm_mmu_memory_cache *mc);
+
+/**
+ * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table.
+ * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:	Intermediate physical address from which to remove the mapping.
+ * @size:	Size of the mapping.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * TLB invalidation is performed for each page-table entry cleared during the
+ * unmapping operation and the reference count for the page-table page
+ * containing the cleared entry is decremented, with unreferenced pages being
+ * freed. Unmapping a cacheable page will ensure that it is clean to the PoC if
+ * FWB is not supported by the CPU.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
+
 /**
  * kvm_pgtable_walk() - Walk a page-table.
  * @pgt:	Page-table structure initialised by kvm_pgtable_*_init().
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 16b34d11e7cf..2f9b872f5355 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -32,10 +32,19 @@
 #define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS	3
 #define KVM_PTE_LEAF_ATTR_LO_S1_AF	BIT(10)
 
+#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR	GENMASK(5, 2)
+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R	BIT(6)
+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W	BIT(7)
+#define KVM_PTE_LEAF_ATTR_LO_S2_SH	GENMASK(9, 8)
+#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS	3
+#define KVM_PTE_LEAF_ATTR_LO_S2_AF	BIT(10)
+
 #define KVM_PTE_LEAF_ATTR_HI		GENMASK(63, 51)
 
 #define KVM_PTE_LEAF_ATTR_HI_S1_XN	BIT(54)
 
+#define KVM_PTE_LEAF_ATTR_HI_S2_XN	BIT(54)
+
 struct kvm_pgtable_walk_data {
 	struct kvm_pgtable		*pgt;
 	struct kvm_pgtable_walker	*walker;
@@ -417,6 +426,270 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
 	pgt->pgd = NULL;
 }
 
+struct stage2_map_data {
+	u64				phys;
+	kvm_pte_t			attr;
+
+	kvm_pte_t			*anchor;
+
+	struct kvm_s2_mmu		*mmu;
+	struct kvm_mmu_memory_cache	*memcache;
+};
+
+static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot,
+				    struct stage2_map_data *data)
+{
+	bool device = prot & KVM_PGTABLE_PROT_DEVICE;
+	kvm_pte_t attr = device ? PAGE_S2_MEMATTR(DEVICE_nGnRE) :
+			    PAGE_S2_MEMATTR(NORMAL);
+	u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
+
+	if (!(prot & KVM_PGTABLE_PROT_X))
+		attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
+	else if (device)
+		return -EINVAL;
+
+	if (prot & KVM_PGTABLE_PROT_R)
+		attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
+
+	if (prot & KVM_PGTABLE_PROT_W)
+		attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
+
+	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
+	attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
+	data->attr = attr;
+	return 0;
+}
+
+static bool stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level,
+				       kvm_pte_t *ptep,
+				       struct stage2_map_data *data)
+{
+	u64 granule = kvm_granule_size(level), phys = data->phys;
+
+	if (!kvm_block_mapping_supported(addr, end, phys, level))
+		return false;
+
+	if (kvm_set_valid_leaf_pte(ptep, phys, data->attr, level))
+		goto out;
+
+	/* There's an existing valid leaf entry, so perform break-before-make */
+	kvm_set_invalid_pte(ptep);
+	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
+	kvm_set_valid_leaf_pte(ptep, phys, data->attr, level);
+out:
+	data->phys += granule;
+	return true;
+}
+
+static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level,
+				     kvm_pte_t *ptep,
+				     struct stage2_map_data *data)
+{
+	if (data->anchor)
+		return 0;
+
+	if (!kvm_block_mapping_supported(addr, end, data->phys, level))
+		return 0;
+
+	kvm_set_invalid_pte(ptep);
+	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, 0);
+	data->anchor = ptep;
+	return 0;
+}
+
+static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+				struct stage2_map_data *data)
+{
+	kvm_pte_t *childp, pte = *ptep;
+	struct page *page = virt_to_page(ptep);
+
+	if (data->anchor) {
+		if (kvm_pte_valid(pte))
+			put_page(page);
+
+		return 0;
+	}
+
+	if (stage2_map_walker_try_leaf(addr, end, level, ptep, data))
+		goto out_get_page;
+
+	if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1))
+		return -EINVAL;
+
+	if (!data->memcache)
+		return -ENOMEM;
+
+	childp = kvm_mmu_memory_cache_alloc(data->memcache);
+	if (!childp)
+		return -ENOMEM;
+
+	/*
+	 * If we've run into an existing block mapping then replace it with
+	 * a table. Accesses beyond 'end' that fall within the new table
+	 * will be mapped lazily.
+	 */
+	if (kvm_pte_valid(pte)) {
+		kvm_set_invalid_pte(ptep);
+		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level);
+		put_page(page);
+	}
+
+	kvm_set_table_pte(ptep, childp);
+
+out_get_page:
+	get_page(page);
+	return 0;
+}
+
+static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
+				      kvm_pte_t *ptep,
+				      struct stage2_map_data *data)
+{
+	int ret = 0;
+
+	if (!data->anchor)
+		return 0;
+
+	free_page((unsigned long)kvm_pte_follow(*ptep));
+	put_page(virt_to_page(ptep));
+
+	if (data->anchor == ptep) {
+		data->anchor = NULL;
+		ret = stage2_map_walk_leaf(addr, end, level, ptep, data);
+	}
+
+	return ret;
+}
+
+/*
+ * This is a little fiddly, as we use all three of the walk flags. The idea
+ * is that the TABLE_PRE callback runs for table entries on the way down,
+ * looking for table entries which we could conceivably replace with a
+ * block entry for this mapping. If it finds one, then it sets the 'anchor'
+ * field in 'struct stage2_map_data' to point at the table entry, before
+ * clearing the entry to zero and descending into the now detached table.
+ *
+ * The behaviour of the LEAF callback then depends on whether or not the
+ * anchor has been set. If not, then we're not using a block mapping higher
+ * up the table and we perform the mapping at the existing leaves instead.
+ * If, on the other hand, the anchor _is_ set, then we drop references to
+ * all valid leaves so that the pages beneath the anchor can be freed.
+ *
+ * Finally, the TABLE_POST callback does nothing if the anchor has not
+ * been set, but otherwise frees the page-table pages while walking back up
+ * the page-table, installing the block entry when it revisits the anchor
+ * pointer and clearing the anchor to NULL.
+ */
+static int stage2_map_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+			     enum kvm_pgtable_walk_flags flag, void * const arg)
+{
+	struct stage2_map_data *data = arg;
+
+	switch (flag) {
+	case KVM_PGTABLE_WALK_TABLE_PRE:
+		return stage2_map_walk_table_pre(addr, end, level, ptep, data);
+	case KVM_PGTABLE_WALK_LEAF:
+		return stage2_map_walk_leaf(addr, end, level, ptep, data);
+	case KVM_PGTABLE_WALK_TABLE_POST:
+		return stage2_map_walk_table_post(addr, end, level, ptep, data);
+	}
+
+	return -EINVAL;
+}
+
+int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
+			   u64 phys, enum kvm_pgtable_prot prot,
+			   struct kvm_mmu_memory_cache *mc)
+{
+	int ret;
+	struct stage2_map_data map_data = {
+		.phys		= ALIGN_DOWN(phys, PAGE_SIZE),
+		.mmu		= pgt->mmu,
+		.memcache	= mc,
+	};
+	struct kvm_pgtable_walker walker = {
+		.cb		= stage2_map_walker,
+		.flags		= KVM_PGTABLE_WALK_TABLE_PRE |
+				  KVM_PGTABLE_WALK_LEAF |
+				  KVM_PGTABLE_WALK_TABLE_POST,
+		.arg		= &map_data,
+	};
+
+	ret = stage2_map_set_prot_attr(prot, &map_data);
+	if (ret)
+		return ret;
+
+	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+	dsb(ishst);
+	return ret;
+}
+
+static void stage2_flush_dcache(void *addr, u64 size)
+{
+	if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+		return;
+
+	__flush_dcache_area(addr, size);
+}
+
+static bool stage2_pte_cacheable(kvm_pte_t pte)
+{
+	u64 memattr = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR, pte);
+	return memattr == PAGE_S2_MEMATTR(NORMAL);
+}
+
+static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+			       enum kvm_pgtable_walk_flags flag,
+			       void * const arg)
+{
+	struct kvm_s2_mmu *mmu = arg;
+	kvm_pte_t pte = *ptep, *childp = NULL;
+	bool need_flush = false;
+
+	if (!kvm_pte_valid(pte))
+		return 0;
+
+	if (kvm_pte_table(pte, level)) {
+		childp = kvm_pte_follow(pte);
+
+		if (page_count(virt_to_page(childp)) != 1)
+			return 0;
+	} else if (stage2_pte_cacheable(pte)) {
+		need_flush = true;
+	}
+
+	/*
+	 * This is similar to the map() path in that we unmap the entire
+	 * block entry and rely on the remaining portions being faulted
+	 * back lazily.
+	 */
+	kvm_set_invalid_pte(ptep);
+	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level);
+	put_page(virt_to_page(ptep));
+
+	if (need_flush) {
+		stage2_flush_dcache(kvm_pte_follow(pte),
+				    kvm_granule_size(level));
+	}
+
+	if (childp)
+		free_page((unsigned long)childp);
+
+	return 0;
+}
+
+int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+	struct kvm_pgtable_walker walker = {
+		.cb	= stage2_unmap_walker,
+		.arg	= pgt->mmu,
+		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
+	};
+
+	return kvm_pgtable_walk(pgt, addr, size, &walker);
+}
+
 int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm)
 {
 	size_t pgd_sz;

From 02bbd374ce4a8aad36c022fd1203cb9f11b52ee3 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 11 Sep 2020 14:25:15 +0100
Subject: [PATCH 07/87] KVM: arm64: Convert kvm_phys_addr_ioremap() to generic
 page-table API

Convert kvm_phys_addr_ioremap() to use kvm_pgtable_stage2_map() instead
of stage2_set_pte().

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Quentin Perret <qperret@google.com>
Link: https://lore.kernel.org/r/20200911132529.19844-8-will@kernel.org
---
 arch/arm64/kvm/mmu.c | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 4607e9ca60a2..32e93f42aed0 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1154,35 +1154,33 @@ static int stage2_pudp_test_and_clear_young(pud_t *pud)
 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 			  phys_addr_t pa, unsigned long size, bool writable)
 {
-	phys_addr_t addr, end;
+	phys_addr_t addr;
 	int ret = 0;
-	unsigned long pfn;
 	struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
+	struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
+	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
+				     KVM_PGTABLE_PROT_R |
+				     (writable ? KVM_PGTABLE_PROT_W : 0);
 
-	end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
-	pfn = __phys_to_pfn(pa);
-
-	for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
-		pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
-
-		if (writable)
-			pte = kvm_s2pte_mkwrite(pte);
+	size += offset_in_page(guest_ipa);
+	guest_ipa &= PAGE_MASK;
 
+	for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
 		ret = kvm_mmu_topup_memory_cache(&cache,
 						 kvm_mmu_cache_min_pages(kvm));
 		if (ret)
-			goto out;
+			break;
+
 		spin_lock(&kvm->mmu_lock);
-		ret = stage2_set_pte(&kvm->arch.mmu, &cache, addr, &pte,
-				     KVM_S2PTE_FLAG_IS_IOMAP);
+		ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
+					     &cache);
 		spin_unlock(&kvm->mmu_lock);
 		if (ret)
-			goto out;
+			break;
 
-		pfn++;
+		pa += PAGE_SIZE;
 	}
 
-out:
 	kvm_mmu_free_memory_cache(&cache);
 	return ret;
 }

From e9edb17ae0a2634bfa5dac529229c06f97f2b669 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 11 Sep 2020 14:25:16 +0100
Subject: [PATCH 08/87] KVM: arm64: Convert kvm_set_spte_hva() to generic
 page-table API

Convert kvm_set_spte_hva() to use kvm_pgtable_stage2_map() instead
of stage2_set_pte().

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Quentin Perret <qperret@google.com>
Link: https://lore.kernel.org/r/20200911132529.19844-9-will@kernel.org
---
 arch/arm64/kvm/mmu.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 32e93f42aed0..d8cfdd25eac5 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1914,28 +1914,27 @@ int kvm_unmap_hva_range(struct kvm *kvm,
 
 static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
 {
-	pte_t *pte = (pte_t *)data;
+	kvm_pfn_t *pfn = (kvm_pfn_t *)data;
 
 	WARN_ON(size != PAGE_SIZE);
+
 	/*
-	 * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE
-	 * flag clear because MMU notifiers will have unmapped a huge PMD before
-	 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
-	 * therefore stage2_set_pte() never needs to clear out a huge PMD
-	 * through this calling path.
+	 * The MMU notifiers will have unmapped a huge PMD before calling
+	 * ->change_pte() (which in turn calls kvm_set_spte_hva()) and
+	 * therefore we never need to clear out a huge PMD through this
+	 * calling path and a memcache is not required.
 	 */
-	stage2_set_pte(&kvm->arch.mmu, NULL, gpa, pte, 0);
+	kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, gpa, PAGE_SIZE,
+			       __pfn_to_phys(*pfn), KVM_PGTABLE_PROT_R, NULL);
 	return 0;
 }
 
-
 int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 {
 	unsigned long end = hva + PAGE_SIZE;
 	kvm_pfn_t pfn = pte_pfn(pte);
-	pte_t stage2_pte;
 
-	if (!kvm->arch.mmu.pgd)
+	if (!kvm->arch.mmu.pgt)
 		return 0;
 
 	trace_kvm_set_spte_hva(hva);
@@ -1945,9 +1944,7 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 	 * just like a translation fault and clean the cache to the PoC.
 	 */
 	clean_dcache_guest_page(pfn, PAGE_SIZE);
-	stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
-	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
-
+	handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pfn);
 	return 0;
 }
 

From 52bae936f0e7be4b4c088505aecb8f04121f1dee Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 11 Sep 2020 14:25:17 +0100
Subject: [PATCH 09/87] KVM: arm64: Convert unmap_stage2_range() to generic
 page-table API

Convert unmap_stage2_range() to use kvm_pgtable_stage2_unmap() instead
of walking the page-table directly.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Quentin Perret <qperret@google.com>
Link: https://lore.kernel.org/r/20200911132529.19844-10-will@kernel.org
---
 arch/arm64/kvm/mmu.c | 62 ++++++++++++++++++++++++++------------------
 1 file changed, 37 insertions(+), 25 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index d8cfdd25eac5..f03d3d0af10d 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -39,6 +39,38 @@ static bool is_iomap(unsigned long flags)
 	return flags & KVM_S2PTE_FLAG_IS_IOMAP;
 }
 
+/*
+ * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
+ * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
+ * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
+ * long will also starve other vCPUs. We have to also make sure that the page
+ * tables are not freed while we released the lock.
+ */
+static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
+			      phys_addr_t end,
+			      int (*fn)(struct kvm_pgtable *, u64, u64),
+			      bool resched)
+{
+	int ret;
+	u64 next;
+
+	do {
+		struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
+		if (!pgt)
+			return -EINVAL;
+
+		next = stage2_pgd_addr_end(kvm, addr, end);
+		ret = fn(pgt, addr, next - addr);
+		if (ret)
+			break;
+
+		if (resched && next != end)
+			cond_resched_lock(&kvm->mmu_lock);
+	} while (addr = next, addr != end);
+
+	return ret;
+}
+
 static bool memslot_is_logging(struct kvm_memory_slot *memslot)
 {
 	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
@@ -220,8 +252,8 @@ static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
  * end up writing old data to disk.
  *
  * This is why right after unmapping a page/section and invalidating
- * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
- * the IO subsystem will never hit in the cache.
+ * the corresponding TLBs, we flush to make sure the IO subsystem will
+ * never hit in the cache.
  *
  * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
  * we then fully enforce cacheability of RAM, no matter what the guest
@@ -344,32 +376,12 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
 				 bool may_block)
 {
 	struct kvm *kvm = mmu->kvm;
-	pgd_t *pgd;
-	phys_addr_t addr = start, end = start + size;
-	phys_addr_t next;
+	phys_addr_t end = start + size;
 
 	assert_spin_locked(&kvm->mmu_lock);
 	WARN_ON(size & ~PAGE_MASK);
-
-	pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
-	do {
-		/*
-		 * Make sure the page table is still active, as another thread
-		 * could have possibly freed the page table, while we released
-		 * the lock.
-		 */
-		if (!READ_ONCE(mmu->pgd))
-			break;
-		next = stage2_pgd_addr_end(kvm, addr, end);
-		if (!stage2_pgd_none(kvm, *pgd))
-			unmap_stage2_p4ds(mmu, pgd, addr, next);
-		/*
-		 * If the range is too large, release the kvm->mmu_lock
-		 * to prevent starvation and lockup detector warnings.
-		 */
-		if (may_block && next != end)
-			cond_resched_lock(&kvm->mmu_lock);
-	} while (pgd++, addr = next, addr != end);
+	WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
+				   may_block));
 }
 
 static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)

From e0e5a07f3f5adca259ffa8ad3f4153ab448a4115 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 11 Sep 2020 14:25:18 +0100
Subject: [PATCH 10/87] KVM: arm64: Add support for stage-2 page-aging in
 generic page-table

Add stage-2 mkyoung(), mkold() and is_young() operations to the generic
page-table code.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Quentin Perret <qperret@google.com>
Link: https://lore.kernel.org/r/20200911132529.19844-11-will@kernel.org
---
 arch/arm64/include/asm/kvm_pgtable.h | 44 ++++++++++++++
 arch/arm64/kvm/hyp/pgtable.c         | 86 ++++++++++++++++++++++++++++
 2 files changed, 130 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 895b2238062b..50782128c861 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -186,6 +186,50 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
  */
 int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
 
+/**
+ * kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry.
+ * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:	Intermediate physical address to identify the page-table entry.
+ *
+ * The offset of @addr within a page is ignored.
+ *
+ * If there is a valid, leaf page-table entry used to translate @addr, then
+ * set the access flag in that entry.
+ *
+ * Return: The old page-table entry prior to setting the flag, 0 on failure.
+ */
+kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr);
+
+/**
+ * kvm_pgtable_stage2_mkold() - Clear the access flag in a page-table entry.
+ * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:	Intermediate physical address to identify the page-table entry.
+ *
+ * The offset of @addr within a page is ignored.
+ *
+ * If there is a valid, leaf page-table entry used to translate @addr, then
+ * clear the access flag in that entry.
+ *
+ * Note that it is the caller's responsibility to invalidate the TLB after
+ * calling this function to ensure that the updated permissions are visible
+ * to the CPUs.
+ *
+ * Return: The old page-table entry prior to clearing the flag, 0 on failure.
+ */
+kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
+
+/**
+ * kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the
+ *				   access flag set.
+ * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:	Intermediate physical address to identify the page-table entry.
+ *
+ * The offset of @addr within a page is ignored.
+ *
+ * Return: True if the page-table entry has the access flag set, false otherwise.
+ */
+bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
+
 /**
  * kvm_pgtable_walk() - Walk a page-table.
  * @pgt:	Page-table structure initialised by kvm_pgtable_*_init().
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 2f9b872f5355..af60ea8ee29d 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -690,6 +690,92 @@ int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
 	return kvm_pgtable_walk(pgt, addr, size, &walker);
 }
 
+struct stage2_attr_data {
+	kvm_pte_t	attr_set;
+	kvm_pte_t	attr_clr;
+	kvm_pte_t	pte;
+};
+
+static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+			      enum kvm_pgtable_walk_flags flag,
+			      void * const arg)
+{
+	kvm_pte_t pte = *ptep;
+	struct stage2_attr_data *data = arg;
+
+	if (!kvm_pte_valid(pte))
+		return 0;
+
+	data->pte = pte;
+	pte &= ~data->attr_clr;
+	pte |= data->attr_set;
+
+	/*
+	 * We may race with the CPU trying to set the access flag here,
+	 * but worst-case the access flag update gets lost and will be
+	 * set on the next access instead.
+	 */
+	if (data->pte != pte)
+		WRITE_ONCE(*ptep, pte);
+
+	return 0;
+}
+
+static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
+				    u64 size, kvm_pte_t attr_set,
+				    kvm_pte_t attr_clr, kvm_pte_t *orig_pte)
+{
+	int ret;
+	kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
+	struct stage2_attr_data data = {
+		.attr_set	= attr_set & attr_mask,
+		.attr_clr	= attr_clr & attr_mask,
+	};
+	struct kvm_pgtable_walker walker = {
+		.cb		= stage2_attr_walker,
+		.arg		= &data,
+		.flags		= KVM_PGTABLE_WALK_LEAF,
+	};
+
+	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
+	if (ret)
+		return ret;
+
+	if (orig_pte)
+		*orig_pte = data.pte;
+	return 0;
+}
+
+kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
+{
+	kvm_pte_t pte = 0;
+	stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
+				 &pte);
+	dsb(ishst);
+	return pte;
+}
+
+kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
+{
+	kvm_pte_t pte = 0;
+	stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
+				 &pte);
+	/*
+	 * "But where's the TLBI?!", you scream.
+	 * "Over in the core code", I sigh.
+	 *
+	 * See the '->clear_flush_young()' callback on the KVM mmu notifier.
+	 */
+	return pte;
+}
+
+bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
+{
+	kvm_pte_t pte = 0;
+	stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte);
+	return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
+}
+
 int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm)
 {
 	size_t pgd_sz;

From ee8efad7994ce204806260bc0261c7e103f704c1 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 11 Sep 2020 14:25:19 +0100
Subject: [PATCH 11/87] KVM: arm64: Convert page-aging and access faults to
 generic page-table API

Convert the page-aging functions and access fault handler to use the
generic page-table code instead of walking the page-table directly.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Alexandru Elisei <alexandru.elisei@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Quentin Perret <qperret@google.com>
Link: https://lore.kernel.org/r/20200911132529.19844-12-will@kernel.org
---
 arch/arm64/kvm/mmu.c | 74 ++++++++++----------------------------------
 1 file changed, 16 insertions(+), 58 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index f03d3d0af10d..36407dcdf01e 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1706,46 +1706,23 @@ out_unlock:
 	return ret;
 }
 
-/*
- * Resolve the access fault by making the page young again.
- * Note that because the faulting entry is guaranteed not to be
- * cached in the TLB, we don't need to invalidate anything.
- * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
- * so there is no need for atomic (pte|pmd)_mkyoung operations.
- */
+/* Resolve the access fault by making the page young again. */
 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
 {
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	kvm_pfn_t pfn;
-	bool pfn_valid = false;
+	pte_t pte;
+	kvm_pte_t kpte;
+	struct kvm_s2_mmu *mmu;
 
 	trace_kvm_access_fault(fault_ipa);
 
 	spin_lock(&vcpu->kvm->mmu_lock);
-
-	if (!stage2_get_leaf_entry(vcpu->arch.hw_mmu, fault_ipa, &pud, &pmd, &pte))
-		goto out;
-
-	if (pud) {		/* HugeTLB */
-		*pud = kvm_s2pud_mkyoung(*pud);
-		pfn = kvm_pud_pfn(*pud);
-		pfn_valid = true;
-	} else	if (pmd) {	/* THP, HugeTLB */
-		*pmd = pmd_mkyoung(*pmd);
-		pfn = pmd_pfn(*pmd);
-		pfn_valid = true;
-	} else {
-		*pte = pte_mkyoung(*pte);	/* Just a page... */
-		pfn = pte_pfn(*pte);
-		pfn_valid = true;
-	}
-
-out:
+	mmu = vcpu->arch.hw_mmu;
+	kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
 	spin_unlock(&vcpu->kvm->mmu_lock);
-	if (pfn_valid)
-		kvm_set_pfn_accessed(pfn);
+
+	pte = __pte(kpte);
+	if (pte_valid(pte))
+		kvm_set_pfn_accessed(pte_pfn(pte));
 }
 
 /**
@@ -1962,38 +1939,19 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 
 static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
 {
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
+	pte_t pte;
+	kvm_pte_t kpte;
 
 	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-	if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
-		return 0;
-
-	if (pud)
-		return stage2_pudp_test_and_clear_young(pud);
-	else if (pmd)
-		return stage2_pmdp_test_and_clear_young(pmd);
-	else
-		return stage2_ptep_test_and_clear_young(pte);
+	kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt, gpa);
+	pte = __pte(kpte);
+	return pte_valid(pte) && pte_young(pte);
 }
 
 static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
 {
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-
 	WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
-	if (!stage2_get_leaf_entry(&kvm->arch.mmu, gpa, &pud, &pmd, &pte))
-		return 0;
-
-	if (pud)
-		return kvm_s2pud_young(*pud);
-	else if (pmd)
-		return pmd_young(*pmd);
-	else
-		return pte_young(*pte);
+	return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, gpa);
 }
 
 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)

From 73d49df2c3e7cad5bf64c92ec03ce9e3823898b9 Mon Sep 17 00:00:00 2001
From: Quentin Perret <qperret@google.com>
Date: Fri, 11 Sep 2020 14:25:20 +0100
Subject: [PATCH 12/87] KVM: arm64: Add support for stage-2 write-protect in
 generic page-table

Add a stage-2 wrprotect() operation to the generic page-table code.

Signed-off-by: Quentin Perret <qperret@google.com>
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Cc: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200911132529.19844-13-will@kernel.org
---
 arch/arm64/include/asm/kvm_pgtable.h | 18 ++++++++++++++++++
 arch/arm64/kvm/hyp/pgtable.c         |  6 ++++++
 2 files changed, 24 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 50782128c861..5ae6006a6098 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -186,6 +186,24 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
  */
 int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
 
+/**
+ * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range
+ *                                  without TLB invalidation.
+ * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:	Intermediate physical address from which to write-protect,
+ * @size:	Size of the range.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * Note that it is the caller's responsibility to invalidate the TLB after
+ * calling this function to ensure that the updated permissions are visible
+ * to the CPUs.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);
+
 /**
  * kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry.
  * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index af60ea8ee29d..480b95030f54 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -746,6 +746,12 @@ static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
 	return 0;
 }
 
+int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+	return stage2_update_leaf_attrs(pgt, addr, size, 0,
+					KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, NULL);
+}
+
 kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
 {
 	kvm_pte_t pte = 0;

From cc38d61cace392a5f064f2981ce6009bb158de21 Mon Sep 17 00:00:00 2001
From: Quentin Perret <qperret@google.com>
Date: Fri, 11 Sep 2020 14:25:21 +0100
Subject: [PATCH 13/87] KVM: arm64: Convert write-protect operation to generic
 page-table API

Convert stage2_wp_range() to call the kvm_pgtable_stage2_wrprotect()
function of the generic page-table code instead of walking the page-table
directly.

Signed-off-by: Quentin Perret <qperret@google.com>
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Cc: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200911132529.19844-14-will@kernel.org
---
 arch/arm64/kvm/mmu.c | 25 ++++---------------------
 1 file changed, 4 insertions(+), 21 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 36407dcdf01e..a37da87eb7eb 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -71,6 +71,9 @@ static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
 	return ret;
 }
 
+#define stage2_apply_range_resched(kvm, addr, end, fn)			\
+	stage2_apply_range(kvm, addr, end, fn, true)
+
 static bool memslot_is_logging(struct kvm_memory_slot *memslot)
 {
 	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
@@ -1302,27 +1305,7 @@ static void  stage2_wp_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
 static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
 {
 	struct kvm *kvm = mmu->kvm;
-	pgd_t *pgd;
-	phys_addr_t next;
-
-	pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
-	do {
-		/*
-		 * Release kvm_mmu_lock periodically if the memory region is
-		 * large. Otherwise, we may see kernel panics with
-		 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
-		 * CONFIG_LOCKDEP. Additionally, holding the lock too long
-		 * will also starve other vCPUs. We have to also make sure
-		 * that the page tables are not freed while we released
-		 * the lock.
-		 */
-		cond_resched_lock(&kvm->mmu_lock);
-		if (!READ_ONCE(mmu->pgd))
-			break;
-		next = stage2_pgd_addr_end(kvm, addr, end);
-		if (stage2_pgd_present(kvm, *pgd))
-			stage2_wp_p4ds(mmu, pgd, addr, next);
-	} while (pgd++, addr = next, addr != end);
+	stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
 }
 
 /**

From 93c66b40d728178722706a306df15ce92c819cd6 Mon Sep 17 00:00:00 2001
From: Quentin Perret <qperret@google.com>
Date: Fri, 11 Sep 2020 14:25:22 +0100
Subject: [PATCH 14/87] KVM: arm64: Add support for stage-2 cache flushing in
 generic page-table

Add support for cache flushing a range of the stage-2 address space to
the generic page-table code.

Signed-off-by: Quentin Perret <qperret@google.com>
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Cc: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200911132529.19844-15-will@kernel.org
---
 arch/arm64/include/asm/kvm_pgtable.h | 15 +++++++++++++++
 arch/arm64/kvm/hyp/pgtable.c         | 26 ++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 5ae6006a6098..77c027456c61 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -248,6 +248,21 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
  */
 bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr);
 
+/**
+ * kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point
+ * 				      of Coherency for guest stage-2 address
+ *				      range.
+ * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:	Intermediate physical address from which to flush.
+ * @size:	Size of the range.
+ *
+ * The offset of @addr within a page is ignored and @size is rounded-up to
+ * the next page boundary.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
+
 /**
  * kvm_pgtable_walk() - Walk a page-table.
  * @pgt:	Page-table structure initialised by kvm_pgtable_*_init().
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 480b95030f54..d382756a527c 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -782,6 +782,32 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
 	return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
 }
 
+static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
+			       enum kvm_pgtable_walk_flags flag,
+			       void * const arg)
+{
+	kvm_pte_t pte = *ptep;
+
+	if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pte))
+		return 0;
+
+	stage2_flush_dcache(kvm_pte_follow(pte), kvm_granule_size(level));
+	return 0;
+}
+
+int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+	struct kvm_pgtable_walker walker = {
+		.cb	= stage2_flush_walker,
+		.flags	= KVM_PGTABLE_WALK_LEAF,
+	};
+
+	if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+		return 0;
+
+	return kvm_pgtable_walk(pgt, addr, size, &walker);
+}
+
 int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm)
 {
 	size_t pgd_sz;

From 8d5207bef6913b5fac91cb58e509711e406c160d Mon Sep 17 00:00:00 2001
From: Quentin Perret <qperret@google.com>
Date: Fri, 11 Sep 2020 14:25:23 +0100
Subject: [PATCH 15/87] KVM: arm64: Convert memslot cache-flushing code to
 generic page-table API

Convert stage2_flush_memslot() to call the kvm_pgtable_stage2_flush()
function of the generic page-table code instead of walking the page-table
directly.

Signed-off-by: Quentin Perret <qperret@google.com>
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Cc: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200911132529.19844-16-will@kernel.org
---
 arch/arm64/kvm/mmu.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index a37da87eb7eb..d5813996c0e4 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -460,21 +460,10 @@ static void stage2_flush_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
 static void stage2_flush_memslot(struct kvm *kvm,
 				 struct kvm_memory_slot *memslot)
 {
-	struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
 	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
 	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
-	phys_addr_t next;
-	pgd_t *pgd;
 
-	pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
-	do {
-		next = stage2_pgd_addr_end(kvm, addr, end);
-		if (!stage2_pgd_none(kvm, *pgd))
-			stage2_flush_p4ds(mmu, pgd, addr, next);
-
-		if (next != end)
-			cond_resched_lock(&kvm->mmu_lock);
-	} while (pgd++, addr = next, addr != end);
+	stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
 }
 
 /**

From adcd4e23291e67575be0c8b08eed04fbe7fd9e77 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 11 Sep 2020 14:25:24 +0100
Subject: [PATCH 16/87] KVM: arm64: Add support for relaxing stage-2 perms in
 generic page-table code

Add support for relaxing the permissions of a stage-2 mapping (i.e.
adding additional permissions) to the generic page-table code.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Quentin Perret <qperret@google.com>
Link: https://lore.kernel.org/r/20200911132529.19844-17-will@kernel.org
---
 arch/arm64/include/asm/kvm_pgtable.h | 19 +++++++++++++++++++
 arch/arm64/kvm/hyp/pgtable.c         | 20 ++++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 77c027456c61..52ab38db04c7 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -236,6 +236,25 @@ kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr);
  */
 kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr);
 
+/**
+ * kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a
+ *				      page-table entry.
+ * @pgt:	Page-table structure initialised by kvm_pgtable_stage2_init().
+ * @addr:	Intermediate physical address to identify the page-table entry.
+ * @prot:	Additional permissions to grant for the mapping.
+ *
+ * The offset of @addr within a page is ignored.
+ *
+ * If there is a valid, leaf page-table entry used to translate @addr, then
+ * relax the permissions in that entry according to the read, write and
+ * execute permissions specified by @prot. No permissions are removed, and
+ * TLB invalidation is performed after updating the entry.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
+				   enum kvm_pgtable_prot prot);
+
 /**
  * kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the
  *				   access flag set.
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index d382756a527c..603d6b415337 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -782,6 +782,26 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
 	return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
 }
 
+int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
+				   enum kvm_pgtable_prot prot)
+{
+	int ret;
+	kvm_pte_t set = 0, clr = 0;
+
+	if (prot & KVM_PGTABLE_PROT_R)
+		set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
+
+	if (prot & KVM_PGTABLE_PROT_W)
+		set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
+
+	if (prot & KVM_PGTABLE_PROT_X)
+		clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
+
+	ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL);
+	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, 0);
+	return ret;
+}
+
 static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 			       enum kvm_pgtable_walk_flags flag,
 			       void * const arg)

From 6f745f1bb5bf2914be2344b0dd7781d0528b3e42 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 11 Sep 2020 14:25:25 +0100
Subject: [PATCH 17/87] KVM: arm64: Convert user_mem_abort() to generic
 page-table API

Convert user_mem_abort() to call kvm_pgtable_stage2_relax_perms() when
handling a stage-2 permission fault and kvm_pgtable_stage2_map() when
handling a stage-2 translation fault, rather than walking the page-table
manually.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Alexandru Elisei <alexandru.elisei@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Quentin Perret <qperret@google.com>
Link: https://lore.kernel.org/r/20200911132529.19844-18-will@kernel.org
---
 arch/arm64/kvm/mmu.c | 124 +++++++++++++++----------------------------
 1 file changed, 44 insertions(+), 80 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index d5813996c0e4..e980e614dd81 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1499,18 +1499,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 {
 	int ret;
 	bool write_fault, writable, force_pte = false;
-	bool exec_fault, needs_exec;
+	bool exec_fault;
+	bool device = false;
 	unsigned long mmu_seq;
-	gfn_t gfn = fault_ipa >> PAGE_SHIFT;
 	struct kvm *kvm = vcpu->kvm;
 	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
 	struct vm_area_struct *vma;
 	short vma_shift;
+	gfn_t gfn;
 	kvm_pfn_t pfn;
-	pgprot_t mem_type = PAGE_S2;
 	bool logging_active = memslot_is_logging(memslot);
-	unsigned long vma_pagesize, flags = 0;
-	struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;
+	unsigned long vma_pagesize;
+	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
+	struct kvm_pgtable *pgt;
 
 	write_fault = kvm_is_write_fault(vcpu);
 	exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
@@ -1543,22 +1544,24 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		vma_pagesize = PAGE_SIZE;
 	}
 
-	/*
-	 * The stage2 has a minimum of 2 level table (For arm64 see
-	 * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
-	 * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
-	 * As for PUD huge maps, we must make sure that we have at least
-	 * 3 levels, i.e, PMD is not folded.
-	 */
-	if (vma_pagesize == PMD_SIZE ||
-	    (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
-		gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
+	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
+		fault_ipa &= huge_page_mask(hstate_vma(vma));
+
+	gfn = fault_ipa >> PAGE_SHIFT;
 	mmap_read_unlock(current->mm);
 
-	/* We need minimum second+third level pages */
-	ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm));
-	if (ret)
-		return ret;
+	/*
+	 * Permission faults just need to update the existing leaf entry,
+	 * and so normally don't require allocations from the memcache. The
+	 * only exception to this is when dirty logging is enabled at runtime
+	 * and a write fault needs to collapse a block entry into a table.
+	 */
+	if (fault_status != FSC_PERM || (logging_active && write_fault)) {
+		ret = kvm_mmu_topup_memory_cache(memcache,
+						 kvm_mmu_cache_min_pages(kvm));
+		if (ret)
+			return ret;
+	}
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	/*
@@ -1581,28 +1584,20 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 
 	if (kvm_is_device_pfn(pfn)) {
-		mem_type = PAGE_S2_DEVICE;
-		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
-	} else if (logging_active) {
-		/*
-		 * Faults on pages in a memslot with logging enabled
-		 * should not be mapped with huge pages (it introduces churn
-		 * and performance degradation), so force a pte mapping.
-		 */
-		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
-
+		device = true;
+	} else if (logging_active && !write_fault) {
 		/*
 		 * Only actually map the page as writable if this was a write
 		 * fault.
 		 */
-		if (!write_fault)
-			writable = false;
+		writable = false;
 	}
 
-	if (exec_fault && is_iomap(flags))
+	if (exec_fault && device)
 		return -ENOEXEC;
 
 	spin_lock(&kvm->mmu_lock);
+	pgt = vcpu->arch.hw_mmu->pgt;
 	if (mmu_notifier_retry(kvm, mmu_seq))
 		goto out_unlock;
 
@@ -1613,62 +1608,31 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (vma_pagesize == PAGE_SIZE && !force_pte)
 		vma_pagesize = transparent_hugepage_adjust(memslot, hva,
 							   &pfn, &fault_ipa);
-	if (writable)
+	if (writable) {
+		prot |= KVM_PGTABLE_PROT_W;
 		kvm_set_pfn_dirty(pfn);
+		mark_page_dirty(kvm, gfn);
+	}
 
-	if (fault_status != FSC_PERM && !is_iomap(flags))
+	if (fault_status != FSC_PERM && !device)
 		clean_dcache_guest_page(pfn, vma_pagesize);
 
-	if (exec_fault)
+	if (exec_fault) {
+		prot |= KVM_PGTABLE_PROT_X;
 		invalidate_icache_guest_page(pfn, vma_pagesize);
+	}
 
-	/*
-	 * If we took an execution fault we have made the
-	 * icache/dcache coherent above and should now let the s2
-	 * mapping be executable.
-	 *
-	 * Write faults (!exec_fault && FSC_PERM) are orthogonal to
-	 * execute permissions, and we preserve whatever we have.
-	 */
-	needs_exec = exec_fault ||
-		(fault_status == FSC_PERM &&
-		 stage2_is_exec(mmu, fault_ipa, vma_pagesize));
+	if (device)
+		prot |= KVM_PGTABLE_PROT_DEVICE;
+	else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
+		prot |= KVM_PGTABLE_PROT_X;
 
-	if (vma_pagesize == PUD_SIZE) {
-		pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
-
-		new_pud = kvm_pud_mkhuge(new_pud);
-		if (writable)
-			new_pud = kvm_s2pud_mkwrite(new_pud);
-
-		if (needs_exec)
-			new_pud = kvm_s2pud_mkexec(new_pud);
-
-		ret = stage2_set_pud_huge(mmu, memcache, fault_ipa, &new_pud);
-	} else if (vma_pagesize == PMD_SIZE) {
-		pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
-
-		new_pmd = kvm_pmd_mkhuge(new_pmd);
-
-		if (writable)
-			new_pmd = kvm_s2pmd_mkwrite(new_pmd);
-
-		if (needs_exec)
-			new_pmd = kvm_s2pmd_mkexec(new_pmd);
-
-		ret = stage2_set_pmd_huge(mmu, memcache, fault_ipa, &new_pmd);
+	if (fault_status == FSC_PERM && !(logging_active && writable)) {
+		ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
 	} else {
-		pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
-
-		if (writable) {
-			new_pte = kvm_s2pte_mkwrite(new_pte);
-			mark_page_dirty(kvm, gfn);
-		}
-
-		if (needs_exec)
-			new_pte = kvm_s2pte_mkexec(new_pte);
-
-		ret = stage2_set_pte(mmu, memcache, fault_ipa, &new_pte, flags);
+		ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
+					     __pfn_to_phys(pfn), prot,
+					     memcache);
 	}
 
 out_unlock:

From 063deeb1f2c234ca5e9d156c7a86825fc5377477 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 11 Sep 2020 14:25:26 +0100
Subject: [PATCH 18/87] KVM: arm64: Check the pgt instead of the pgd when
 modifying page-table

In preparation for removing the 'pgd' field of 'struct kvm_s2_mmu',
update the few remaining users to check the 'pgt' field instead.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Quentin Perret <qperret@google.com>
Link: https://lore.kernel.org/r/20200911132529.19844-19-will@kernel.org
---
 arch/arm64/kvm/mmu.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index e980e614dd81..ed5aefd5d0a7 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1829,7 +1829,7 @@ static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *dat
 int kvm_unmap_hva_range(struct kvm *kvm,
 			unsigned long start, unsigned long end, unsigned flags)
 {
-	if (!kvm->arch.mmu.pgd)
+	if (!kvm->arch.mmu.pgt)
 		return 0;
 
 	trace_kvm_unmap_hva_range(start, end);
@@ -1892,7 +1892,7 @@ static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *
 
 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
 {
-	if (!kvm->arch.mmu.pgd)
+	if (!kvm->arch.mmu.pgt)
 		return 0;
 	trace_kvm_age_hva(start, end);
 	return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
@@ -1900,7 +1900,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
 
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
 {
-	if (!kvm->arch.mmu.pgd)
+	if (!kvm->arch.mmu.pgt)
 		return 0;
 	trace_kvm_test_age_hva(hva);
 	return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,

From 3f26ab58e3272ceada11279c2e90894d4995cf79 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 11 Sep 2020 14:25:27 +0100
Subject: [PATCH 19/87] KVM: arm64: Remove unused page-table code

Now that KVM is using the generic page-table code to manage the guest
stage-2 page-tables, we can remove a bunch of unused macros, #defines
and static inline functions from the old implementation.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Quentin Perret <qperret@google.com>
Link: https://lore.kernel.org/r/20200911132529.19844-20-will@kernel.org
---
 arch/arm64/include/asm/kvm_mmu.h        | 171 ------
 arch/arm64/include/asm/pgtable-hwdef.h  |  18 -
 arch/arm64/include/asm/pgtable-prot.h   |  13 -
 arch/arm64/include/asm/stage2_pgtable.h | 215 -------
 arch/arm64/kvm/mmu.c                    | 755 ------------------------
 5 files changed, 1172 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 42fb50cfe0d8..c490fe8089b3 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -107,12 +107,6 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v)
 #define kvm_phys_size(kvm)		(_AC(1, ULL) << kvm_phys_shift(kvm))
 #define kvm_phys_mask(kvm)		(kvm_phys_size(kvm) - _AC(1, ULL))
 
-static inline bool kvm_page_empty(void *ptr)
-{
-	struct page *ptr_page = virt_to_page(ptr);
-	return page_count(ptr_page) == 1;
-}
-
 #include <asm/kvm_pgtable.h>
 #include <asm/stage2_pgtable.h>
 
@@ -135,123 +129,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);
 phys_addr_t kvm_mmu_get_httbr(void);
 phys_addr_t kvm_get_idmap_vector(void);
 int kvm_mmu_init(void);
-#define kvm_mk_pmd(ptep)					\
-	__pmd(__phys_to_pmd_val(__pa(ptep)) | PMD_TYPE_TABLE)
-#define kvm_mk_pud(pmdp)					\
-	__pud(__phys_to_pud_val(__pa(pmdp)) | PMD_TYPE_TABLE)
-#define kvm_mk_p4d(pmdp)					\
-	__p4d(__phys_to_p4d_val(__pa(pmdp)) | PUD_TYPE_TABLE)
-
-#define kvm_set_pud(pudp, pud)		set_pud(pudp, pud)
-
-#define kvm_pfn_pte(pfn, prot)		pfn_pte(pfn, prot)
-#define kvm_pfn_pmd(pfn, prot)		pfn_pmd(pfn, prot)
-#define kvm_pfn_pud(pfn, prot)		pfn_pud(pfn, prot)
-
-#define kvm_pud_pfn(pud)		pud_pfn(pud)
-
-#define kvm_pmd_mkhuge(pmd)		pmd_mkhuge(pmd)
-#define kvm_pud_mkhuge(pud)		pud_mkhuge(pud)
-
-static inline pte_t kvm_s2pte_mkwrite(pte_t pte)
-{
-	pte_val(pte) |= PTE_S2_RDWR;
-	return pte;
-}
-
-static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd)
-{
-	pmd_val(pmd) |= PMD_S2_RDWR;
-	return pmd;
-}
-
-static inline pud_t kvm_s2pud_mkwrite(pud_t pud)
-{
-	pud_val(pud) |= PUD_S2_RDWR;
-	return pud;
-}
-
-static inline pte_t kvm_s2pte_mkexec(pte_t pte)
-{
-	pte_val(pte) &= ~PTE_S2_XN;
-	return pte;
-}
-
-static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd)
-{
-	pmd_val(pmd) &= ~PMD_S2_XN;
-	return pmd;
-}
-
-static inline pud_t kvm_s2pud_mkexec(pud_t pud)
-{
-	pud_val(pud) &= ~PUD_S2_XN;
-	return pud;
-}
-
-static inline void kvm_set_s2pte_readonly(pte_t *ptep)
-{
-	pteval_t old_pteval, pteval;
-
-	pteval = READ_ONCE(pte_val(*ptep));
-	do {
-		old_pteval = pteval;
-		pteval &= ~PTE_S2_RDWR;
-		pteval |= PTE_S2_RDONLY;
-		pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
-	} while (pteval != old_pteval);
-}
-
-static inline bool kvm_s2pte_readonly(pte_t *ptep)
-{
-	return (READ_ONCE(pte_val(*ptep)) & PTE_S2_RDWR) == PTE_S2_RDONLY;
-}
-
-static inline bool kvm_s2pte_exec(pte_t *ptep)
-{
-	return !(READ_ONCE(pte_val(*ptep)) & PTE_S2_XN);
-}
-
-static inline void kvm_set_s2pmd_readonly(pmd_t *pmdp)
-{
-	kvm_set_s2pte_readonly((pte_t *)pmdp);
-}
-
-static inline bool kvm_s2pmd_readonly(pmd_t *pmdp)
-{
-	return kvm_s2pte_readonly((pte_t *)pmdp);
-}
-
-static inline bool kvm_s2pmd_exec(pmd_t *pmdp)
-{
-	return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN);
-}
-
-static inline void kvm_set_s2pud_readonly(pud_t *pudp)
-{
-	kvm_set_s2pte_readonly((pte_t *)pudp);
-}
-
-static inline bool kvm_s2pud_readonly(pud_t *pudp)
-{
-	return kvm_s2pte_readonly((pte_t *)pudp);
-}
-
-static inline bool kvm_s2pud_exec(pud_t *pudp)
-{
-	return !(READ_ONCE(pud_val(*pudp)) & PUD_S2_XN);
-}
-
-static inline pud_t kvm_s2pud_mkyoung(pud_t pud)
-{
-	return pud_mkyoung(pud);
-}
-
-static inline bool kvm_s2pud_young(pud_t pud)
-{
-	return pud_young(pud);
-}
-
 
 struct kvm;
 
@@ -293,30 +170,6 @@ static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn,
 	}
 }
 
-static inline void __kvm_flush_dcache_pte(pte_t pte)
-{
-	if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
-		struct page *page = pte_page(pte);
-		kvm_flush_dcache_to_poc(page_address(page), PAGE_SIZE);
-	}
-}
-
-static inline void __kvm_flush_dcache_pmd(pmd_t pmd)
-{
-	if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
-		struct page *page = pmd_page(pmd);
-		kvm_flush_dcache_to_poc(page_address(page), PMD_SIZE);
-	}
-}
-
-static inline void __kvm_flush_dcache_pud(pud_t pud)
-{
-	if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
-		struct page *page = pud_page(pud);
-		kvm_flush_dcache_to_poc(page_address(page), PUD_SIZE);
-	}
-}
-
 void kvm_set_way_flush(struct kvm_vcpu *vcpu);
 void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);
 
@@ -477,30 +330,6 @@ static inline int hyp_map_aux_data(void)
 
 #define kvm_phys_to_vttbr(addr)		phys_to_ttbr(addr)
 
-/*
- * Get the magic number 'x' for VTTBR:BADDR of this KVM instance.
- * With v8.2 LVA extensions, 'x' should be a minimum of 6 with
- * 52bit IPS.
- */
-static inline int arm64_vttbr_x(u32 ipa_shift, u32 levels)
-{
-	int x = ARM64_VTTBR_X(ipa_shift, levels);
-
-	return (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && x < 6) ? 6 : x;
-}
-
-static inline u64 vttbr_baddr_mask(u32 ipa_shift, u32 levels)
-{
-	unsigned int x = arm64_vttbr_x(ipa_shift, levels);
-
-	return GENMASK_ULL(PHYS_MASK_SHIFT - 1, x);
-}
-
-static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm)
-{
-	return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm));
-}
-
 static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
 {
 	struct kvm_vmid *vmid = &mmu->vmid;
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 1a989353144e..2b5f822fb033 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -156,7 +156,6 @@
 #define PTE_CONT		(_AT(pteval_t, 1) << 52)	/* Contiguous range */
 #define PTE_PXN			(_AT(pteval_t, 1) << 53)	/* Privileged XN */
 #define PTE_UXN			(_AT(pteval_t, 1) << 54)	/* User XN */
-#define PTE_HYP_XN		(_AT(pteval_t, 1) << 54)	/* HYP XN */
 
 #define PTE_ADDR_LOW		(((_AT(pteval_t, 1) << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
 #ifdef CONFIG_ARM64_PA_BITS_52
@@ -172,23 +171,6 @@
 #define PTE_ATTRINDX(t)		(_AT(pteval_t, (t)) << 2)
 #define PTE_ATTRINDX_MASK	(_AT(pteval_t, 7) << 2)
 
-/*
- * 2nd stage PTE definitions
- */
-#define PTE_S2_RDONLY		(_AT(pteval_t, 1) << 6)   /* HAP[2:1] */
-#define PTE_S2_RDWR		(_AT(pteval_t, 3) << 6)   /* HAP[2:1] */
-#define PTE_S2_XN		(_AT(pteval_t, 2) << 53)  /* XN[1:0] */
-#define PTE_S2_SW_RESVD		(_AT(pteval_t, 15) << 55) /* Reserved for SW */
-
-#define PMD_S2_RDONLY		(_AT(pmdval_t, 1) << 6)   /* HAP[2:1] */
-#define PMD_S2_RDWR		(_AT(pmdval_t, 3) << 6)   /* HAP[2:1] */
-#define PMD_S2_XN		(_AT(pmdval_t, 2) << 53)  /* XN[1:0] */
-#define PMD_S2_SW_RESVD		(_AT(pmdval_t, 15) << 55) /* Reserved for SW */
-
-#define PUD_S2_RDONLY		(_AT(pudval_t, 1) << 6)   /* HAP[2:1] */
-#define PUD_S2_RDWR		(_AT(pudval_t, 3) << 6)   /* HAP[2:1] */
-#define PUD_S2_XN		(_AT(pudval_t, 2) << 53)  /* XN[1:0] */
-
 /*
  * Memory Attribute override for Stage-2 (MemAttr[3:0])
  */
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 88acd7e1cd05..8f094c43072a 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -73,19 +73,6 @@ extern bool arm64_use_ng_mappings;
 		__val;							\
 	 })
 
-#define PAGE_S2_XN							\
-	({								\
-		u64 __val;						\
-		if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))		\
-			__val = 0;					\
-		else							\
-			__val = PTE_S2_XN;				\
-		__val;							\
-	})
-
-#define PAGE_S2			__pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(NORMAL) | PTE_S2_RDONLY | PAGE_S2_XN)
-#define PAGE_S2_DEVICE		__pgprot(_PROT_DEFAULT | PAGE_S2_MEMATTR(DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_S2_XN)
-
 #define PAGE_NONE		__pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)
 /* shared+writable pages are clean by default, hence PTE_RDONLY|PTE_WRITE */
 #define PAGE_SHARED		__pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE)
diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h
index 996bf98f0cab..fe341a6578c3 100644
--- a/arch/arm64/include/asm/stage2_pgtable.h
+++ b/arch/arm64/include/asm/stage2_pgtable.h
@@ -8,7 +8,6 @@
 #ifndef __ARM64_S2_PGTABLE_H_
 #define __ARM64_S2_PGTABLE_H_
 
-#include <linux/hugetlb.h>
 #include <linux/pgtable.h>
 
 /*
@@ -36,21 +35,6 @@
 #define stage2_pgdir_size(kvm)		(1ULL << stage2_pgdir_shift(kvm))
 #define stage2_pgdir_mask(kvm)		~(stage2_pgdir_size(kvm) - 1)
 
-/*
- * The number of PTRS across all concatenated stage2 tables given by the
- * number of bits resolved at the initial level.
- * If we force more levels than necessary, we may have (stage2_pgdir_shift > IPA),
- * in which case, stage2_pgd_ptrs will have one entry.
- */
-#define pgd_ptrs_shift(ipa, pgdir_shift)	\
-	((ipa) > (pgdir_shift) ? ((ipa) - (pgdir_shift)) : 0)
-#define __s2_pgd_ptrs(ipa, lvls)		\
-	(1 << (pgd_ptrs_shift((ipa), pt_levels_pgdir_shift(lvls))))
-#define __s2_pgd_size(ipa, lvls)	(__s2_pgd_ptrs((ipa), (lvls)) * sizeof(pgd_t))
-
-#define stage2_pgd_ptrs(kvm)		__s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
-#define stage2_pgd_size(kvm)		__s2_pgd_size(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
-
 /*
  * kvm_mmmu_cache_min_pages() is the number of pages required to install
  * a stage-2 translation. We pre-allocate the entry level page table at
@@ -58,196 +42,6 @@
  */
 #define kvm_mmu_cache_min_pages(kvm)	(kvm_stage2_levels(kvm) - 1)
 
-/* Stage2 PUD definitions when the level is present */
-static inline bool kvm_stage2_has_pud(struct kvm *kvm)
-{
-	return (CONFIG_PGTABLE_LEVELS > 3) && (kvm_stage2_levels(kvm) > 3);
-}
-
-#define S2_PUD_SHIFT			ARM64_HW_PGTABLE_LEVEL_SHIFT(1)
-#define S2_PUD_SIZE			(1UL << S2_PUD_SHIFT)
-#define S2_PUD_MASK			(~(S2_PUD_SIZE - 1))
-
-#define stage2_pgd_none(kvm, pgd)		pgd_none(pgd)
-#define stage2_pgd_clear(kvm, pgd)		pgd_clear(pgd)
-#define stage2_pgd_present(kvm, pgd)		pgd_present(pgd)
-#define stage2_pgd_populate(kvm, pgd, p4d)	pgd_populate(NULL, pgd, p4d)
-
-static inline p4d_t *stage2_p4d_offset(struct kvm *kvm,
-				       pgd_t *pgd, unsigned long address)
-{
-	return p4d_offset(pgd, address);
-}
-
-static inline void stage2_p4d_free(struct kvm *kvm, p4d_t *p4d)
-{
-}
-
-static inline bool stage2_p4d_table_empty(struct kvm *kvm, p4d_t *p4dp)
-{
-	return false;
-}
-
-static inline phys_addr_t stage2_p4d_addr_end(struct kvm *kvm,
-					      phys_addr_t addr, phys_addr_t end)
-{
-	return end;
-}
-
-static inline bool stage2_p4d_none(struct kvm *kvm, p4d_t p4d)
-{
-	if (kvm_stage2_has_pud(kvm))
-		return p4d_none(p4d);
-	else
-		return 0;
-}
-
-static inline void stage2_p4d_clear(struct kvm *kvm, p4d_t *p4dp)
-{
-	if (kvm_stage2_has_pud(kvm))
-		p4d_clear(p4dp);
-}
-
-static inline bool stage2_p4d_present(struct kvm *kvm, p4d_t p4d)
-{
-	if (kvm_stage2_has_pud(kvm))
-		return p4d_present(p4d);
-	else
-		return 1;
-}
-
-static inline void stage2_p4d_populate(struct kvm *kvm, p4d_t *p4d, pud_t *pud)
-{
-	if (kvm_stage2_has_pud(kvm))
-		p4d_populate(NULL, p4d, pud);
-}
-
-static inline pud_t *stage2_pud_offset(struct kvm *kvm,
-				       p4d_t *p4d, unsigned long address)
-{
-	if (kvm_stage2_has_pud(kvm))
-		return pud_offset(p4d, address);
-	else
-		return (pud_t *)p4d;
-}
-
-static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud)
-{
-	if (kvm_stage2_has_pud(kvm))
-		free_page((unsigned long)pud);
-}
-
-static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp)
-{
-	if (kvm_stage2_has_pud(kvm))
-		return kvm_page_empty(pudp);
-	else
-		return false;
-}
-
-static inline phys_addr_t
-stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
-{
-	if (kvm_stage2_has_pud(kvm)) {
-		phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK;
-
-		return (boundary - 1 < end - 1) ? boundary : end;
-	} else {
-		return end;
-	}
-}
-
-/* Stage2 PMD definitions when the level is present */
-static inline bool kvm_stage2_has_pmd(struct kvm *kvm)
-{
-	return (CONFIG_PGTABLE_LEVELS > 2) && (kvm_stage2_levels(kvm) > 2);
-}
-
-#define S2_PMD_SHIFT			ARM64_HW_PGTABLE_LEVEL_SHIFT(2)
-#define S2_PMD_SIZE			(1UL << S2_PMD_SHIFT)
-#define S2_PMD_MASK			(~(S2_PMD_SIZE - 1))
-
-static inline bool stage2_pud_none(struct kvm *kvm, pud_t pud)
-{
-	if (kvm_stage2_has_pmd(kvm))
-		return pud_none(pud);
-	else
-		return 0;
-}
-
-static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pud)
-{
-	if (kvm_stage2_has_pmd(kvm))
-		pud_clear(pud);
-}
-
-static inline bool stage2_pud_present(struct kvm *kvm, pud_t pud)
-{
-	if (kvm_stage2_has_pmd(kvm))
-		return pud_present(pud);
-	else
-		return 1;
-}
-
-static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pud, pmd_t *pmd)
-{
-	if (kvm_stage2_has_pmd(kvm))
-		pud_populate(NULL, pud, pmd);
-}
-
-static inline pmd_t *stage2_pmd_offset(struct kvm *kvm,
-				       pud_t *pud, unsigned long address)
-{
-	if (kvm_stage2_has_pmd(kvm))
-		return pmd_offset(pud, address);
-	else
-		return (pmd_t *)pud;
-}
-
-static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd)
-{
-	if (kvm_stage2_has_pmd(kvm))
-		free_page((unsigned long)pmd);
-}
-
-static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud)
-{
-	if (kvm_stage2_has_pmd(kvm))
-		return pud_huge(pud);
-	else
-		return 0;
-}
-
-static inline bool stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp)
-{
-	if (kvm_stage2_has_pmd(kvm))
-		return kvm_page_empty(pmdp);
-	else
-		return 0;
-}
-
-static inline phys_addr_t
-stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
-{
-	if (kvm_stage2_has_pmd(kvm)) {
-		phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK;
-
-		return (boundary - 1 < end - 1) ? boundary : end;
-	} else {
-		return end;
-	}
-}
-
-static inline bool stage2_pte_table_empty(struct kvm *kvm, pte_t *ptep)
-{
-	return kvm_page_empty(ptep);
-}
-
-static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr)
-{
-	return (((addr) >> stage2_pgdir_shift(kvm)) & (stage2_pgd_ptrs(kvm) - 1));
-}
-
 static inline phys_addr_t
 stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 {
@@ -256,13 +50,4 @@ stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 	return (boundary - 1 < end - 1) ? boundary : end;
 }
 
-/*
- * Level values for the ARMv8.4-TTL extension, mapping PUD/PMD/PTE and
- * the architectural page-table level.
- */
-#define S2_NO_LEVEL_HINT	0
-#define S2_PUD_LEVEL		1
-#define S2_PMD_LEVEL		2
-#define S2_PTE_LEVEL		3
-
 #endif	/* __ARM64_S2_PGTABLE_H_ */
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index ed5aefd5d0a7..dd0ea7d8065a 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -31,13 +31,6 @@ static phys_addr_t hyp_idmap_vector;
 
 static unsigned long io_map_base;
 
-#define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
-#define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)
-
-static bool is_iomap(unsigned long flags)
-{
-	return flags & KVM_S2PTE_FLAG_IS_IOMAP;
-}
 
 /*
  * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
@@ -90,154 +83,11 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
 	kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
 }
 
-static void kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
-				   int level)
-{
-	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ipa, level);
-}
-
-/*
- * D-Cache management functions. They take the page table entries by
- * value, as they are flushing the cache using the kernel mapping (or
- * kmap on 32bit).
- */
-static void kvm_flush_dcache_pte(pte_t pte)
-{
-	__kvm_flush_dcache_pte(pte);
-}
-
-static void kvm_flush_dcache_pmd(pmd_t pmd)
-{
-	__kvm_flush_dcache_pmd(pmd);
-}
-
-static void kvm_flush_dcache_pud(pud_t pud)
-{
-	__kvm_flush_dcache_pud(pud);
-}
-
 static bool kvm_is_device_pfn(unsigned long pfn)
 {
 	return !pfn_valid(pfn);
 }
 
-/**
- * stage2_dissolve_pmd() - clear and flush huge PMD entry
- * @mmu:	pointer to mmu structure to operate on
- * @addr:	IPA
- * @pmd:	pmd pointer for IPA
- *
- * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
- */
-static void stage2_dissolve_pmd(struct kvm_s2_mmu *mmu, phys_addr_t addr, pmd_t *pmd)
-{
-	if (!pmd_thp_or_huge(*pmd))
-		return;
-
-	pmd_clear(pmd);
-	kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
-	put_page(virt_to_page(pmd));
-}
-
-/**
- * stage2_dissolve_pud() - clear and flush huge PUD entry
- * @mmu:	pointer to mmu structure to operate on
- * @addr:	IPA
- * @pud:	pud pointer for IPA
- *
- * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
- */
-static void stage2_dissolve_pud(struct kvm_s2_mmu *mmu, phys_addr_t addr, pud_t *pudp)
-{
-	struct kvm *kvm = mmu->kvm;
-
-	if (!stage2_pud_huge(kvm, *pudp))
-		return;
-
-	stage2_pud_clear(kvm, pudp);
-	kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
-	put_page(virt_to_page(pudp));
-}
-
-static void clear_stage2_pgd_entry(struct kvm_s2_mmu *mmu, pgd_t *pgd, phys_addr_t addr)
-{
-	struct kvm *kvm = mmu->kvm;
-	p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL);
-	stage2_pgd_clear(kvm, pgd);
-	kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
-	stage2_p4d_free(kvm, p4d_table);
-	put_page(virt_to_page(pgd));
-}
-
-static void clear_stage2_p4d_entry(struct kvm_s2_mmu *mmu, p4d_t *p4d, phys_addr_t addr)
-{
-	struct kvm *kvm = mmu->kvm;
-	pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, p4d, 0);
-	stage2_p4d_clear(kvm, p4d);
-	kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
-	stage2_pud_free(kvm, pud_table);
-	put_page(virt_to_page(p4d));
-}
-
-static void clear_stage2_pud_entry(struct kvm_s2_mmu *mmu, pud_t *pud, phys_addr_t addr)
-{
-	struct kvm *kvm = mmu->kvm;
-	pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0);
-
-	VM_BUG_ON(stage2_pud_huge(kvm, *pud));
-	stage2_pud_clear(kvm, pud);
-	kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
-	stage2_pmd_free(kvm, pmd_table);
-	put_page(virt_to_page(pud));
-}
-
-static void clear_stage2_pmd_entry(struct kvm_s2_mmu *mmu, pmd_t *pmd, phys_addr_t addr)
-{
-	pte_t *pte_table = pte_offset_kernel(pmd, 0);
-	VM_BUG_ON(pmd_thp_or_huge(*pmd));
-	pmd_clear(pmd);
-	kvm_tlb_flush_vmid_ipa(mmu, addr, S2_NO_LEVEL_HINT);
-	free_page((unsigned long)pte_table);
-	put_page(virt_to_page(pmd));
-}
-
-static inline void kvm_set_pte(pte_t *ptep, pte_t new_pte)
-{
-	WRITE_ONCE(*ptep, new_pte);
-	dsb(ishst);
-}
-
-static inline void kvm_set_pmd(pmd_t *pmdp, pmd_t new_pmd)
-{
-	WRITE_ONCE(*pmdp, new_pmd);
-	dsb(ishst);
-}
-
-static inline void kvm_pmd_populate(pmd_t *pmdp, pte_t *ptep)
-{
-	kvm_set_pmd(pmdp, kvm_mk_pmd(ptep));
-}
-
-static inline void kvm_pud_populate(pud_t *pudp, pmd_t *pmdp)
-{
-	WRITE_ONCE(*pudp, kvm_mk_pud(pmdp));
-	dsb(ishst);
-}
-
-static inline void kvm_p4d_populate(p4d_t *p4dp, pud_t *pudp)
-{
-	WRITE_ONCE(*p4dp, kvm_mk_p4d(pudp));
-	dsb(ishst);
-}
-
-static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
-{
-#ifndef __PAGETABLE_P4D_FOLDED
-	WRITE_ONCE(*pgdp, kvm_mk_pgd(p4dp));
-	dsb(ishst);
-#endif
-}
-
 /*
  * Unmapping vs dcache management:
  *
@@ -262,108 +112,6 @@ static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp)
  * we then fully enforce cacheability of RAM, no matter what the guest
  * does.
  */
-static void unmap_stage2_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
-		       phys_addr_t addr, phys_addr_t end)
-{
-	phys_addr_t start_addr = addr;
-	pte_t *pte, *start_pte;
-
-	start_pte = pte = pte_offset_kernel(pmd, addr);
-	do {
-		if (!pte_none(*pte)) {
-			pte_t old_pte = *pte;
-
-			kvm_set_pte(pte, __pte(0));
-			kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
-
-			/* No need to invalidate the cache for device mappings */
-			if (!kvm_is_device_pfn(pte_pfn(old_pte)))
-				kvm_flush_dcache_pte(old_pte);
-
-			put_page(virt_to_page(pte));
-		}
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-
-	if (stage2_pte_table_empty(mmu->kvm, start_pte))
-		clear_stage2_pmd_entry(mmu, pmd, start_addr);
-}
-
-static void unmap_stage2_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
-		       phys_addr_t addr, phys_addr_t end)
-{
-	struct kvm *kvm = mmu->kvm;
-	phys_addr_t next, start_addr = addr;
-	pmd_t *pmd, *start_pmd;
-
-	start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr);
-	do {
-		next = stage2_pmd_addr_end(kvm, addr, end);
-		if (!pmd_none(*pmd)) {
-			if (pmd_thp_or_huge(*pmd)) {
-				pmd_t old_pmd = *pmd;
-
-				pmd_clear(pmd);
-				kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
-
-				kvm_flush_dcache_pmd(old_pmd);
-
-				put_page(virt_to_page(pmd));
-			} else {
-				unmap_stage2_ptes(mmu, pmd, addr, next);
-			}
-		}
-	} while (pmd++, addr = next, addr != end);
-
-	if (stage2_pmd_table_empty(kvm, start_pmd))
-		clear_stage2_pud_entry(mmu, pud, start_addr);
-}
-
-static void unmap_stage2_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
-		       phys_addr_t addr, phys_addr_t end)
-{
-	struct kvm *kvm = mmu->kvm;
-	phys_addr_t next, start_addr = addr;
-	pud_t *pud, *start_pud;
-
-	start_pud = pud = stage2_pud_offset(kvm, p4d, addr);
-	do {
-		next = stage2_pud_addr_end(kvm, addr, end);
-		if (!stage2_pud_none(kvm, *pud)) {
-			if (stage2_pud_huge(kvm, *pud)) {
-				pud_t old_pud = *pud;
-
-				stage2_pud_clear(kvm, pud);
-				kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
-				kvm_flush_dcache_pud(old_pud);
-				put_page(virt_to_page(pud));
-			} else {
-				unmap_stage2_pmds(mmu, pud, addr, next);
-			}
-		}
-	} while (pud++, addr = next, addr != end);
-
-	if (stage2_pud_table_empty(kvm, start_pud))
-		clear_stage2_p4d_entry(mmu, p4d, start_addr);
-}
-
-static void unmap_stage2_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
-		       phys_addr_t addr, phys_addr_t end)
-{
-	struct kvm *kvm = mmu->kvm;
-	phys_addr_t next, start_addr = addr;
-	p4d_t *p4d, *start_p4d;
-
-	start_p4d = p4d = stage2_p4d_offset(kvm, pgd, addr);
-	do {
-		next = stage2_p4d_addr_end(kvm, addr, end);
-		if (!stage2_p4d_none(kvm, *p4d))
-			unmap_stage2_puds(mmu, p4d, addr, next);
-	} while (p4d++, addr = next, addr != end);
-
-	if (stage2_p4d_table_empty(kvm, start_p4d))
-		clear_stage2_pgd_entry(mmu, pgd, start_addr);
-}
-
 /**
  * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
  * @kvm:   The VM pointer
@@ -392,71 +140,6 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si
 	__unmap_stage2_range(mmu, start, size, true);
 }
 
-static void stage2_flush_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd,
-			      phys_addr_t addr, phys_addr_t end)
-{
-	pte_t *pte;
-
-	pte = pte_offset_kernel(pmd, addr);
-	do {
-		if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
-			kvm_flush_dcache_pte(*pte);
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-static void stage2_flush_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
-			      phys_addr_t addr, phys_addr_t end)
-{
-	struct kvm *kvm = mmu->kvm;
-	pmd_t *pmd;
-	phys_addr_t next;
-
-	pmd = stage2_pmd_offset(kvm, pud, addr);
-	do {
-		next = stage2_pmd_addr_end(kvm, addr, end);
-		if (!pmd_none(*pmd)) {
-			if (pmd_thp_or_huge(*pmd))
-				kvm_flush_dcache_pmd(*pmd);
-			else
-				stage2_flush_ptes(mmu, pmd, addr, next);
-		}
-	} while (pmd++, addr = next, addr != end);
-}
-
-static void stage2_flush_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
-			      phys_addr_t addr, phys_addr_t end)
-{
-	struct kvm *kvm = mmu->kvm;
-	pud_t *pud;
-	phys_addr_t next;
-
-	pud = stage2_pud_offset(kvm, p4d, addr);
-	do {
-		next = stage2_pud_addr_end(kvm, addr, end);
-		if (!stage2_pud_none(kvm, *pud)) {
-			if (stage2_pud_huge(kvm, *pud))
-				kvm_flush_dcache_pud(*pud);
-			else
-				stage2_flush_pmds(mmu, pud, addr, next);
-		}
-	} while (pud++, addr = next, addr != end);
-}
-
-static void stage2_flush_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
-			      phys_addr_t addr, phys_addr_t end)
-{
-	struct kvm *kvm = mmu->kvm;
-	p4d_t *p4d;
-	phys_addr_t next;
-
-	p4d = stage2_p4d_offset(kvm, pgd, addr);
-	do {
-		next = stage2_p4d_addr_end(kvm, addr, end);
-		if (!stage2_p4d_none(kvm, *p4d))
-			stage2_flush_puds(mmu, p4d, addr, next);
-	} while (p4d++, addr = next, addr != end);
-}
-
 static void stage2_flush_memslot(struct kvm *kvm,
 				 struct kvm_memory_slot *memslot)
 {
@@ -805,348 +488,6 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
 	}
 }
 
-static p4d_t *stage2_get_p4d(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
-			     phys_addr_t addr)
-{
-	struct kvm *kvm = mmu->kvm;
-	pgd_t *pgd;
-	p4d_t *p4d;
-
-	pgd = mmu->pgd + stage2_pgd_index(kvm, addr);
-	if (stage2_pgd_none(kvm, *pgd)) {
-		if (!cache)
-			return NULL;
-		p4d = kvm_mmu_memory_cache_alloc(cache);
-		stage2_pgd_populate(kvm, pgd, p4d);
-		get_page(virt_to_page(pgd));
-	}
-
-	return stage2_p4d_offset(kvm, pgd, addr);
-}
-
-static pud_t *stage2_get_pud(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
-			     phys_addr_t addr)
-{
-	struct kvm *kvm = mmu->kvm;
-	p4d_t *p4d;
-	pud_t *pud;
-
-	p4d = stage2_get_p4d(mmu, cache, addr);
-	if (stage2_p4d_none(kvm, *p4d)) {
-		if (!cache)
-			return NULL;
-		pud = kvm_mmu_memory_cache_alloc(cache);
-		stage2_p4d_populate(kvm, p4d, pud);
-		get_page(virt_to_page(p4d));
-	}
-
-	return stage2_pud_offset(kvm, p4d, addr);
-}
-
-static pmd_t *stage2_get_pmd(struct kvm_s2_mmu *mmu, struct kvm_mmu_memory_cache *cache,
-			     phys_addr_t addr)
-{
-	struct kvm *kvm = mmu->kvm;
-	pud_t *pud;
-	pmd_t *pmd;
-
-	pud = stage2_get_pud(mmu, cache, addr);
-	if (!pud || stage2_pud_huge(kvm, *pud))
-		return NULL;
-
-	if (stage2_pud_none(kvm, *pud)) {
-		if (!cache)
-			return NULL;
-		pmd = kvm_mmu_memory_cache_alloc(cache);
-		stage2_pud_populate(kvm, pud, pmd);
-		get_page(virt_to_page(pud));
-	}
-
-	return stage2_pmd_offset(kvm, pud, addr);
-}
-
-static int stage2_set_pmd_huge(struct kvm_s2_mmu *mmu,
-			       struct kvm_mmu_memory_cache *cache,
-			       phys_addr_t addr, const pmd_t *new_pmd)
-{
-	pmd_t *pmd, old_pmd;
-
-retry:
-	pmd = stage2_get_pmd(mmu, cache, addr);
-	VM_BUG_ON(!pmd);
-
-	old_pmd = *pmd;
-	/*
-	 * Multiple vcpus faulting on the same PMD entry, can
-	 * lead to them sequentially updating the PMD with the
-	 * same value. Following the break-before-make
-	 * (pmd_clear() followed by tlb_flush()) process can
-	 * hinder forward progress due to refaults generated
-	 * on missing translations.
-	 *
-	 * Skip updating the page table if the entry is
-	 * unchanged.
-	 */
-	if (pmd_val(old_pmd) == pmd_val(*new_pmd))
-		return 0;
-
-	if (pmd_present(old_pmd)) {
-		/*
-		 * If we already have PTE level mapping for this block,
-		 * we must unmap it to avoid inconsistent TLB state and
-		 * leaking the table page. We could end up in this situation
-		 * if the memory slot was marked for dirty logging and was
-		 * reverted, leaving PTE level mappings for the pages accessed
-		 * during the period. So, unmap the PTE level mapping for this
-		 * block and retry, as we could have released the upper level
-		 * table in the process.
-		 *
-		 * Normal THP split/merge follows mmu_notifier callbacks and do
-		 * get handled accordingly.
-		 */
-		if (!pmd_thp_or_huge(old_pmd)) {
-			unmap_stage2_range(mmu, addr & S2_PMD_MASK, S2_PMD_SIZE);
-			goto retry;
-		}
-		/*
-		 * Mapping in huge pages should only happen through a
-		 * fault.  If a page is merged into a transparent huge
-		 * page, the individual subpages of that huge page
-		 * should be unmapped through MMU notifiers before we
-		 * get here.
-		 *
-		 * Merging of CompoundPages is not supported; they
-		 * should become splitting first, unmapped, merged,
-		 * and mapped back in on-demand.
-		 */
-		WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
-		pmd_clear(pmd);
-		kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PMD_LEVEL);
-	} else {
-		get_page(virt_to_page(pmd));
-	}
-
-	kvm_set_pmd(pmd, *new_pmd);
-	return 0;
-}
-
-static int stage2_set_pud_huge(struct kvm_s2_mmu *mmu,
-			       struct kvm_mmu_memory_cache *cache,
-			       phys_addr_t addr, const pud_t *new_pudp)
-{
-	struct kvm *kvm = mmu->kvm;
-	pud_t *pudp, old_pud;
-
-retry:
-	pudp = stage2_get_pud(mmu, cache, addr);
-	VM_BUG_ON(!pudp);
-
-	old_pud = *pudp;
-
-	/*
-	 * A large number of vcpus faulting on the same stage 2 entry,
-	 * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
-	 * Skip updating the page tables if there is no change.
-	 */
-	if (pud_val(old_pud) == pud_val(*new_pudp))
-		return 0;
-
-	if (stage2_pud_present(kvm, old_pud)) {
-		/*
-		 * If we already have table level mapping for this block, unmap
-		 * the range for this block and retry.
-		 */
-		if (!stage2_pud_huge(kvm, old_pud)) {
-			unmap_stage2_range(mmu, addr & S2_PUD_MASK, S2_PUD_SIZE);
-			goto retry;
-		}
-
-		WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
-		stage2_pud_clear(kvm, pudp);
-		kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PUD_LEVEL);
-	} else {
-		get_page(virt_to_page(pudp));
-	}
-
-	kvm_set_pud(pudp, *new_pudp);
-	return 0;
-}
-
-/*
- * stage2_get_leaf_entry - walk the stage2 VM page tables and return
- * true if a valid and present leaf-entry is found. A pointer to the
- * leaf-entry is returned in the appropriate level variable - pudpp,
- * pmdpp, ptepp.
- */
-static bool stage2_get_leaf_entry(struct kvm_s2_mmu *mmu, phys_addr_t addr,
-				  pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
-{
-	struct kvm *kvm = mmu->kvm;
-	pud_t *pudp;
-	pmd_t *pmdp;
-	pte_t *ptep;
-
-	*pudpp = NULL;
-	*pmdpp = NULL;
-	*ptepp = NULL;
-
-	pudp = stage2_get_pud(mmu, NULL, addr);
-	if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
-		return false;
-
-	if (stage2_pud_huge(kvm, *pudp)) {
-		*pudpp = pudp;
-		return true;
-	}
-
-	pmdp = stage2_pmd_offset(kvm, pudp, addr);
-	if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
-		return false;
-
-	if (pmd_thp_or_huge(*pmdp)) {
-		*pmdpp = pmdp;
-		return true;
-	}
-
-	ptep = pte_offset_kernel(pmdp, addr);
-	if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
-		return false;
-
-	*ptepp = ptep;
-	return true;
-}
-
-static bool stage2_is_exec(struct kvm_s2_mmu *mmu, phys_addr_t addr, unsigned long sz)
-{
-	pud_t *pudp;
-	pmd_t *pmdp;
-	pte_t *ptep;
-	bool found;
-
-	found = stage2_get_leaf_entry(mmu, addr, &pudp, &pmdp, &ptep);
-	if (!found)
-		return false;
-
-	if (pudp)
-		return sz <= PUD_SIZE && kvm_s2pud_exec(pudp);
-	else if (pmdp)
-		return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp);
-	else
-		return sz == PAGE_SIZE && kvm_s2pte_exec(ptep);
-}
-
-static int stage2_set_pte(struct kvm_s2_mmu *mmu,
-			  struct kvm_mmu_memory_cache *cache,
-			  phys_addr_t addr, const pte_t *new_pte,
-			  unsigned long flags)
-{
-	struct kvm *kvm = mmu->kvm;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte, old_pte;
-	bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
-	bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
-
-	VM_BUG_ON(logging_active && !cache);
-
-	/* Create stage-2 page table mapping - Levels 0 and 1 */
-	pud = stage2_get_pud(mmu, cache, addr);
-	if (!pud) {
-		/*
-		 * Ignore calls from kvm_set_spte_hva for unallocated
-		 * address ranges.
-		 */
-		return 0;
-	}
-
-	/*
-	 * While dirty page logging - dissolve huge PUD, then continue
-	 * on to allocate page.
-	 */
-	if (logging_active)
-		stage2_dissolve_pud(mmu, addr, pud);
-
-	if (stage2_pud_none(kvm, *pud)) {
-		if (!cache)
-			return 0; /* ignore calls from kvm_set_spte_hva */
-		pmd = kvm_mmu_memory_cache_alloc(cache);
-		stage2_pud_populate(kvm, pud, pmd);
-		get_page(virt_to_page(pud));
-	}
-
-	pmd = stage2_pmd_offset(kvm, pud, addr);
-	if (!pmd) {
-		/*
-		 * Ignore calls from kvm_set_spte_hva for unallocated
-		 * address ranges.
-		 */
-		return 0;
-	}
-
-	/*
-	 * While dirty page logging - dissolve huge PMD, then continue on to
-	 * allocate page.
-	 */
-	if (logging_active)
-		stage2_dissolve_pmd(mmu, addr, pmd);
-
-	/* Create stage-2 page mappings - Level 2 */
-	if (pmd_none(*pmd)) {
-		if (!cache)
-			return 0; /* ignore calls from kvm_set_spte_hva */
-		pte = kvm_mmu_memory_cache_alloc(cache);
-		kvm_pmd_populate(pmd, pte);
-		get_page(virt_to_page(pmd));
-	}
-
-	pte = pte_offset_kernel(pmd, addr);
-
-	if (iomap && pte_present(*pte))
-		return -EFAULT;
-
-	/* Create 2nd stage page table mapping - Level 3 */
-	old_pte = *pte;
-	if (pte_present(old_pte)) {
-		/* Skip page table update if there is no change */
-		if (pte_val(old_pte) == pte_val(*new_pte))
-			return 0;
-
-		kvm_set_pte(pte, __pte(0));
-		kvm_tlb_flush_vmid_ipa(mmu, addr, S2_PTE_LEVEL);
-	} else {
-		get_page(virt_to_page(pte));
-	}
-
-	kvm_set_pte(pte, *new_pte);
-	return 0;
-}
-
-#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-static int stage2_ptep_test_and_clear_young(pte_t *pte)
-{
-	if (pte_young(*pte)) {
-		*pte = pte_mkold(*pte);
-		return 1;
-	}
-	return 0;
-}
-#else
-static int stage2_ptep_test_and_clear_young(pte_t *pte)
-{
-	return __ptep_test_and_clear_young(pte);
-}
-#endif
-
-static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
-{
-	return stage2_ptep_test_and_clear_young((pte_t *)pmd);
-}
-
-static int stage2_pudp_test_and_clear_young(pud_t *pud)
-{
-	return stage2_ptep_test_and_clear_young((pte_t *)pud);
-}
-
 /**
  * kvm_phys_addr_ioremap - map a device range to guest IPA
  *
@@ -1189,102 +530,6 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 	return ret;
 }
 
-/**
- * stage2_wp_ptes - write protect PMD range
- * @pmd:	pointer to pmd entry
- * @addr:	range start address
- * @end:	range end address
- */
-static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
-{
-	pte_t *pte;
-
-	pte = pte_offset_kernel(pmd, addr);
-	do {
-		if (!pte_none(*pte)) {
-			if (!kvm_s2pte_readonly(pte))
-				kvm_set_s2pte_readonly(pte);
-		}
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-/**
- * stage2_wp_pmds - write protect PUD range
- * kvm:		kvm instance for the VM
- * @pud:	pointer to pud entry
- * @addr:	range start address
- * @end:	range end address
- */
-static void stage2_wp_pmds(struct kvm_s2_mmu *mmu, pud_t *pud,
-			   phys_addr_t addr, phys_addr_t end)
-{
-	struct kvm *kvm = mmu->kvm;
-	pmd_t *pmd;
-	phys_addr_t next;
-
-	pmd = stage2_pmd_offset(kvm, pud, addr);
-
-	do {
-		next = stage2_pmd_addr_end(kvm, addr, end);
-		if (!pmd_none(*pmd)) {
-			if (pmd_thp_or_huge(*pmd)) {
-				if (!kvm_s2pmd_readonly(pmd))
-					kvm_set_s2pmd_readonly(pmd);
-			} else {
-				stage2_wp_ptes(pmd, addr, next);
-			}
-		}
-	} while (pmd++, addr = next, addr != end);
-}
-
-/**
- * stage2_wp_puds - write protect P4D range
- * @p4d:	pointer to p4d entry
- * @addr:	range start address
- * @end:	range end address
- */
-static void  stage2_wp_puds(struct kvm_s2_mmu *mmu, p4d_t *p4d,
-			    phys_addr_t addr, phys_addr_t end)
-{
-	struct kvm *kvm = mmu->kvm;
-	pud_t *pud;
-	phys_addr_t next;
-
-	pud = stage2_pud_offset(kvm, p4d, addr);
-	do {
-		next = stage2_pud_addr_end(kvm, addr, end);
-		if (!stage2_pud_none(kvm, *pud)) {
-			if (stage2_pud_huge(kvm, *pud)) {
-				if (!kvm_s2pud_readonly(pud))
-					kvm_set_s2pud_readonly(pud);
-			} else {
-				stage2_wp_pmds(mmu, pud, addr, next);
-			}
-		}
-	} while (pud++, addr = next, addr != end);
-}
-
-/**
- * stage2_wp_p4ds - write protect PGD range
- * @pgd:	pointer to pgd entry
- * @addr:	range start address
- * @end:	range end address
- */
-static void  stage2_wp_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd,
-			    phys_addr_t addr, phys_addr_t end)
-{
-	struct kvm *kvm = mmu->kvm;
-	p4d_t *p4d;
-	phys_addr_t next;
-
-	p4d = stage2_p4d_offset(kvm, pgd, addr);
-	do {
-		next = stage2_p4d_addr_end(kvm, addr, end);
-		if (!stage2_p4d_none(kvm, *p4d))
-			stage2_wp_puds(mmu, p4d, addr, next);
-	} while (p4d++, addr = next, addr != end);
-}
-
 /**
  * stage2_wp_range() - write protect stage2 memory region range
  * @kvm:	The KVM pointer

From 74cfa7ea66a7e54a5ea1d8ad1fb74a5e832ed867 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 11 Sep 2020 14:25:28 +0100
Subject: [PATCH 20/87] KVM: arm64: Remove unused 'pgd' field from 'struct
 kvm_s2_mmu'

The stage-2 page-tables are entirely encapsulated by the 'pgt' field of
'struct kvm_s2_mmu', so remove the unused 'pgd' field.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Quentin Perret <qperret@google.com>
Link: https://lore.kernel.org/r/20200911132529.19844-21-will@kernel.org
---
 arch/arm64/include/asm/kvm_host.h | 1 -
 arch/arm64/kvm/mmu.c              | 2 --
 2 files changed, 3 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 0b7c702b2151..41caf29bd93c 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -79,7 +79,6 @@ struct kvm_s2_mmu {
 	 * for vEL1/EL0 with vHCR_EL2.VM == 0.  In that case, we use the
 	 * canonical stage-2 page tables.
 	 */
-	pgd_t		*pgd;
 	phys_addr_t	pgd_phys;
 	struct kvm_pgtable *pgt;
 
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index dd0ea7d8065a..21b70abf65a7 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -389,7 +389,6 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
 	mmu->kvm = kvm;
 	mmu->pgt = pgt;
 	mmu->pgd_phys = __pa(pgt->pgd);
-	mmu->pgd = (void *)pgt->pgd;
 	mmu->vmid.vmid_gen = 0;
 	return 0;
 
@@ -475,7 +474,6 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
 	spin_lock(&kvm->mmu_lock);
 	pgt = mmu->pgt;
 	if (pgt) {
-		mmu->pgd = NULL;
 		mmu->pgd_phys = 0;
 		mmu->pgt = NULL;
 		free_percpu(mmu->last_vcpu_ran);

From c9b69a0cf0b4336fe7d2e35c46273debc68910fd Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 11 Sep 2020 14:25:29 +0100
Subject: [PATCH 21/87] KVM: arm64: Don't constrain maximum IPA size based on
 host configuration

Now that the guest stage-2 page-tables are managed independently from
the host stage-1 page-tables, we can avoid constraining the IPA size
based on the host and instead limit it only based on the PARange field
of the ID_AA64MMFR0 register.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Quentin Perret <qperret@google.com>
Link: https://lore.kernel.org/r/20200911132529.19844-22-will@kernel.org
---
 arch/arm64/kvm/reset.c | 40 ++++++----------------------------------
 1 file changed, 6 insertions(+), 34 deletions(-)

diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index ee33875c5c2a..2202b710d44c 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -339,7 +339,7 @@ u32 get_kvm_ipa_limit(void)
 
 int kvm_set_ipa_limit(void)
 {
-	unsigned int ipa_max, pa_max, va_max, parange, tgran_2;
+	unsigned int parange, tgran_2;
 	u64 mmfr0;
 
 	mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
@@ -376,39 +376,11 @@ int kvm_set_ipa_limit(void)
 		break;
 	}
 
-	pa_max = id_aa64mmfr0_parange_to_phys_shift(parange);
-
-	/* Clamp the IPA limit to the PA size supported by the kernel */
-	ipa_max = (pa_max > PHYS_MASK_SHIFT) ? PHYS_MASK_SHIFT : pa_max;
-	/*
-	 * Since our stage2 table is dependent on the stage1 page table code,
-	 * we must always honor the following condition:
-	 *
-	 *  Number of levels in Stage1 >= Number of levels in Stage2.
-	 *
-	 * So clamp the ipa limit further down to limit the number of levels.
-	 * Since we can concatenate upto 16 tables at entry level, we could
-	 * go upto 4bits above the maximum VA addressable with the current
-	 * number of levels.
-	 */
-	va_max = PGDIR_SHIFT + PAGE_SHIFT - 3;
-	va_max += 4;
-
-	if (va_max < ipa_max)
-		ipa_max = va_max;
-
-	/*
-	 * If the final limit is lower than the real physical address
-	 * limit of the CPUs, report the reason.
-	 */
-	if (ipa_max < pa_max)
-		pr_info("kvm: Limiting the IPA size due to kernel %s Address limit\n",
-			(va_max < pa_max) ? "Virtual" : "Physical");
-
-	WARN(ipa_max < KVM_PHYS_SHIFT,
-	     "KVM IPA limit (%d bit) is smaller than default size\n", ipa_max);
-	kvm_ipa_limit = ipa_max;
-	kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit);
+	kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange);
+	WARN(kvm_ipa_limit < KVM_PHYS_SHIFT,
+	     "KVM IPA Size Limit (%d bits) is smaller than default size\n",
+	     kvm_ipa_limit);
+	kvm_info("IPA Size Limit: %d bits\n", kvm_ipa_limit);
 
 	return 0;
 }

From 501a67a25dd459a7d16b35ecdbe8a1ca5463e2bd Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Tue, 15 Sep 2020 11:46:25 +0100
Subject: [PATCH 22/87] KVM: arm64: Remove __activate_vm wrapper

The __activate_vm wrapper serves no useful function and has a misleading
name as it simply calls __load_guest_stage2 and does not touch
HCR_EL2.VM so remove it.

Also rename __deactivate_vm to __load_host_stage2 to match naming
pattern.

Signed-off-by: Andrew Scull <ascull@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200915104643.2543892-2-ascull@google.com
---
 arch/arm64/kvm/hyp/include/hyp/switch.h |  5 -----
 arch/arm64/kvm/hyp/nvhe/switch.c        |  8 ++++----
 arch/arm64/kvm/hyp/vhe/switch.c         | 10 +++++-----
 3 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 5b6b8fa00f0a..0864f88bc840 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -126,11 +126,6 @@ static inline void ___deactivate_traps(struct kvm_vcpu *vcpu)
 	}
 }
 
-static inline void __activate_vm(struct kvm_s2_mmu *mmu)
-{
-	__load_guest_stage2(mmu);
-}
-
 static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
 {
 	u64 par, tmp;
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 0970442d2dbc..3c9c065b3264 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -93,7 +93,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
 	write_sysreg(CPTR_EL2_DEFAULT, cptr_el2);
 }
 
-static void __deactivate_vm(struct kvm_vcpu *vcpu)
+static void __load_host_stage2(void)
 {
 	write_sysreg(0, vttbr_el2);
 }
@@ -194,7 +194,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 	__sysreg32_restore_state(vcpu);
 	__sysreg_restore_state_nvhe(guest_ctxt);
 
-	__activate_vm(kern_hyp_va(vcpu->arch.hw_mmu));
+	__load_guest_stage2(kern_hyp_va(vcpu->arch.hw_mmu));
 	__activate_traps(vcpu);
 
 	__hyp_vgic_restore_state(vcpu);
@@ -219,7 +219,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 	__hyp_vgic_save_state(vcpu);
 
 	__deactivate_traps(vcpu);
-	__deactivate_vm(vcpu);
+	__load_host_stage2();
 
 	__sysreg_restore_state_nvhe(host_ctxt);
 
@@ -253,7 +253,7 @@ void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt)
 	if (read_sysreg(vttbr_el2)) {
 		__timer_disable_traps(vcpu);
 		__deactivate_traps(vcpu);
-		__deactivate_vm(vcpu);
+		__load_host_stage2();
 		__sysreg_restore_state_nvhe(host_ctxt);
 	}
 
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index c1da4f86ccac..6636522a8529 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -120,12 +120,12 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
 	 * HCR_EL2.TGE.
 	 *
 	 * We have already configured the guest's stage 1 translation in
-	 * kvm_vcpu_load_sysregs_vhe above.  We must now call __activate_vm
-	 * before __activate_traps, because __activate_vm configures
-	 * stage 2 translation, and __activate_traps clear HCR_EL2.TGE
-	 * (among other things).
+	 * kvm_vcpu_load_sysregs_vhe above.  We must now call
+	 * __load_guest_stage2 before __activate_traps, because
+	 * __load_guest_stage2 configures stage 2 translation, and
+	 * __activate_traps clear HCR_EL2.TGE (among other things).
 	 */
-	__activate_vm(vcpu->arch.hw_mmu);
+	__load_guest_stage2(vcpu->arch.hw_mmu);
 	__activate_traps(vcpu);
 
 	sysreg_restore_guest_state_vhe(guest_ctxt);

From 6a0259ed29bba83653a36fabcdf6b06aecd78596 Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Tue, 15 Sep 2020 11:46:26 +0100
Subject: [PATCH 23/87] KVM: arm64: Remove hyp_panic arguments

hyp_panic is able to find all the context it needs from within itself so
remove the argument. The __hyp_panic wrapper becomes redundant so is
also removed.

Signed-off-by: Andrew Scull <ascull@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200915104643.2543892-3-ascull@google.com
---
 arch/arm64/include/asm/kvm_hyp.h        |  2 +-
 arch/arm64/kvm/hyp/hyp-entry.S          |  7 +------
 arch/arm64/kvm/hyp/include/hyp/switch.h |  4 +---
 arch/arm64/kvm/hyp/nvhe/switch.c        |  8 ++++++--
 arch/arm64/kvm/hyp/vhe/switch.c         | 10 ++++++----
 5 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 46689e7db46c..3de99b323061 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -89,7 +89,7 @@ void deactivate_traps_vhe_put(void);
 
 u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
 
-void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt);
+void __noreturn hyp_panic(void);
 #ifdef __KVM_NVHE_HYPERVISOR__
 void __noreturn __hyp_do_panic(unsigned long, ...);
 #endif
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 46b4dab933d0..9cb3fbca5d79 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -210,12 +210,7 @@ SYM_FUNC_START(__hyp_do_panic)
 SYM_FUNC_END(__hyp_do_panic)
 #endif
 
-SYM_CODE_START(__hyp_panic)
-	get_host_ctxt x0, x1
-	b	hyp_panic
-SYM_CODE_END(__hyp_panic)
-
-.macro invalid_vector	label, target = __hyp_panic
+.macro invalid_vector	label, target = hyp_panic
 	.align	2
 SYM_CODE_START(\label)
 	b \target
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 0864f88bc840..96ea3fdd0c20 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -510,13 +510,11 @@ static inline void __set_host_arch_workaround_state(struct kvm_vcpu *vcpu)
 static inline void __kvm_unexpected_el2_exception(void)
 {
 	unsigned long addr, fixup;
-	struct kvm_cpu_context *host_ctxt;
 	struct exception_table_entry *entry, *end;
 	unsigned long elr_el2 = read_sysreg(elr_el2);
 
 	entry = hyp_symbol_addr(__start___kvm_ex_table);
 	end = hyp_symbol_addr(__stop___kvm_ex_table);
-	host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
 
 	while (entry < end) {
 		addr = (unsigned long)&entry->insn + entry->insn;
@@ -531,7 +529,7 @@ static inline void __kvm_unexpected_el2_exception(void)
 		return;
 	}
 
-	hyp_panic(host_ctxt);
+	hyp_panic();
 }
 
 #endif /* __ARM64_KVM_HYP_SWITCH_H__ */
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 3c9c065b3264..26d6fd4b6f4d 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -242,14 +242,18 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 	return exit_code;
 }
 
-void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt)
+void __noreturn hyp_panic(void)
 {
 	u64 spsr = read_sysreg_el2(SYS_SPSR);
 	u64 elr = read_sysreg_el2(SYS_ELR);
 	u64 par = read_sysreg(par_el1);
-	struct kvm_vcpu *vcpu = host_ctxt->__hyp_running_vcpu;
+	struct kvm_cpu_context *host_ctxt;
+	struct kvm_vcpu *vcpu;
 	unsigned long str_va;
 
+	host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+	vcpu = host_ctxt->__hyp_running_vcpu;
+
 	if (read_sysreg(vttbr_el2)) {
 		__timer_disable_traps(vcpu);
 		__deactivate_traps(vcpu);
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index 6636522a8529..c1fd47557713 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -192,10 +192,12 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 	return ret;
 }
 
-static void __hyp_call_panic(u64 spsr, u64 elr, u64 par,
-			     struct kvm_cpu_context *host_ctxt)
+static void __hyp_call_panic(u64 spsr, u64 elr, u64 par)
 {
+	struct kvm_cpu_context *host_ctxt;
 	struct kvm_vcpu *vcpu;
+
+	host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
 	vcpu = host_ctxt->__hyp_running_vcpu;
 
 	__deactivate_traps(vcpu);
@@ -208,13 +210,13 @@ static void __hyp_call_panic(u64 spsr, u64 elr, u64 par,
 }
 NOKPROBE_SYMBOL(__hyp_call_panic);
 
-void __noreturn hyp_panic(struct kvm_cpu_context *host_ctxt)
+void __noreturn hyp_panic(void)
 {
 	u64 spsr = read_sysreg_el2(SYS_SPSR);
 	u64 elr = read_sysreg_el2(SYS_ELR);
 	u64 par = read_sysreg(par_el1);
 
-	__hyp_call_panic(spsr, elr, par, host_ctxt);
+	__hyp_call_panic(spsr, elr, par);
 	unreachable();
 }
 

From d7ca1079d8ea897a8c45c9f78693972e28859056 Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Tue, 15 Sep 2020 11:46:27 +0100
Subject: [PATCH 24/87] KVM: arm64: Remove kvm_host_data_t typedef

The kvm_host_data_t typedef is used inconsistently and goes against the
kernel's coding style. Remove it in favour of the full struct specifier.

Signed-off-by: Andrew Scull <ascull@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200915104643.2543892-4-ascull@google.com
---
 arch/arm64/include/asm/kvm_host.h | 4 +---
 arch/arm64/kvm/arm.c              | 4 ++--
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index e52c927aade5..16adbefde1cc 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -262,8 +262,6 @@ struct kvm_host_data {
 	struct kvm_pmu_events pmu_events;
 };
 
-typedef struct kvm_host_data kvm_host_data_t;
-
 struct vcpu_reset_state {
 	unsigned long	pc;
 	unsigned long	r0;
@@ -565,7 +563,7 @@ void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);
 
 struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
 
-DECLARE_PER_CPU(kvm_host_data_t, kvm_host_data);
+DECLARE_PER_CPU(struct kvm_host_data, kvm_host_data);
 
 static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt)
 {
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 46dc3d75cf13..1af4c77feda2 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -46,7 +46,7 @@
 __asm__(".arch_extension	virt");
 #endif
 
-DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data);
+DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
 static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
 
 /* The VMID used in the VTTBR */
@@ -1538,7 +1538,7 @@ static int init_hyp_mode(void)
 	}
 
 	for_each_possible_cpu(cpu) {
-		kvm_host_data_t *cpu_data;
+		struct kvm_host_data *cpu_data;
 
 		cpu_data = per_cpu_ptr(&kvm_host_data, cpu);
 		err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP);

From ceee2fe4ba1f0b9dd388fd1030bea0162ca66a40 Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Tue, 15 Sep 2020 11:46:28 +0100
Subject: [PATCH 25/87] KVM: arm64: Choose hyp symbol based on context

Make CHOOSE_HYP_SYM select the symbol of the active hypervisor for the
host, the nVHE symbol for nVHE and the VHE symbol for VHE. The nVHE and
VHE hypervisors see their own symbols without prefixes and trigger a
link error when trying to use a symbol of the other hypervisor.

Signed-off-by: Andrew Scull <ascull@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Cc: David Brazdil <dbrazdil@google.com>
Link: https://lore.kernel.org/r/20200915104643.2543892-5-ascull@google.com
---
 arch/arm64/include/asm/kvm_asm.h | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 6f98fbd0ac81..a952859117b2 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -60,10 +60,24 @@
 	DECLARE_KVM_VHE_SYM(sym);		\
 	DECLARE_KVM_NVHE_SYM(sym)
 
-#define CHOOSE_VHE_SYM(sym)	sym
-#define CHOOSE_NVHE_SYM(sym)	kvm_nvhe_sym(sym)
+#if defined(__KVM_NVHE_HYPERVISOR__)
+
+#define CHOOSE_HYP_SYM(sym)	CHOOSE_NVHE_SYM(sym)
+#define CHOOSE_NVHE_SYM(sym)	sym
+/* The nVHE hypervisor shouldn't even try to access VHE symbols */
+extern void *__nvhe_undefined_symbol;
+#define CHOOSE_VHE_SYM(sym)	__nvhe_undefined_symbol
+
+#elif defined(__KVM_VHE_HYPERVISOR)
+
+#define CHOOSE_HYP_SYM(sym)	CHOOSE_VHE_SYM(sym)
+#define CHOOSE_VHE_SYM(sym)	sym
+/* The VHE hypervisor shouldn't even try to access nVHE symbols */
+extern void *__vhe_undefined_symbol;
+#define CHOOSE_NVHE_SYM(sym)	__vhe_undefined_symbol
+
+#else
 
-#ifndef __KVM_NVHE_HYPERVISOR__
 /*
  * BIG FAT WARNINGS:
  *
@@ -77,10 +91,9 @@
  */
 #define CHOOSE_HYP_SYM(sym)	(is_kernel_in_hyp_mode() ? CHOOSE_VHE_SYM(sym) \
 					   : CHOOSE_NVHE_SYM(sym))
-#else
-/* The nVHE hypervisor shouldn't even try to access anything */
-extern void *__nvhe_undefined_symbol;
-#define CHOOSE_HYP_SYM(sym)	__nvhe_undefined_symbol
+#define CHOOSE_VHE_SYM(sym)	sym
+#define CHOOSE_NVHE_SYM(sym)	kvm_nvhe_sym(sym)
+
 #endif
 
 /* Translate a kernel address @ptr into its equivalent linear mapping */

From a0e479523e3fb4fa52c351cc4906f38097de789a Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Tue, 15 Sep 2020 11:46:29 +0100
Subject: [PATCH 26/87] KVM: arm64: Save chosen hyp vector to a percpu variable

Introduce a percpu variable to hold the address of the selected hyp
vector that will be used with guests. This avoids the selection process
each time a guest is being entered and can be used by nVHE when a
separate vector is introduced for the host.

Signed-off-by: Andrew Scull <ascull@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200915104643.2543892-6-ascull@google.com
---
 arch/arm64/include/asm/kvm_hyp.h | 2 ++
 arch/arm64/kvm/arm.c             | 5 ++++-
 arch/arm64/kvm/hyp/vhe/switch.c  | 2 +-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 3de99b323061..1e2491da324e 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -12,6 +12,8 @@
 #include <asm/alternative.h>
 #include <asm/sysreg.h>
 
+DECLARE_PER_CPU(unsigned long, kvm_hyp_vector);
+
 #define read_sysreg_elx(r,nvh,vh)					\
 	({								\
 		u64 reg;						\
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 1af4c77feda2..77fc856ea513 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -47,6 +47,7 @@ __asm__(".arch_extension	virt");
 #endif
 
 DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
+DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
 static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
 
 /* The VMID used in the VTTBR */
@@ -1276,7 +1277,7 @@ static void cpu_init_hyp_mode(void)
 
 	pgd_ptr = kvm_mmu_get_httbr();
 	hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE;
-	vector_ptr = (unsigned long)kvm_get_hyp_vector();
+	vector_ptr = __this_cpu_read(kvm_hyp_vector);
 
 	/*
 	 * Call initialization code, and switch to the full blown HYP code.
@@ -1309,6 +1310,8 @@ static void cpu_hyp_reinit(void)
 
 	cpu_hyp_reset();
 
+	__this_cpu_write(kvm_hyp_vector, (unsigned long)kvm_get_hyp_vector());
+
 	if (is_kernel_in_hyp_mode())
 		kvm_timer_init_vhe();
 	else
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index c1fd47557713..b49cf53c11f0 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -59,7 +59,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
 
 	write_sysreg(val, cpacr_el1);
 
-	write_sysreg(kvm_get_hyp_vector(), vbar_el1);
+	write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1);
 }
 NOKPROBE_SYMBOL(__activate_traps);
 

From 6e3bfbb22c51bd1f121ca5c4bce6b51dc6cdceae Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Tue, 15 Sep 2020 11:46:30 +0100
Subject: [PATCH 27/87] KVM: arm64: nVHE: Use separate vector for the host

The host is treated differently from the guests when an exception is
taken so introduce a separate vector that is specialized for the host.
This also allows the nVHE specific code to move out of hyp-entry.S and
into nvhe/host.S.

The host is only expected to make HVC calls and anything else is
considered invalid and results in a panic.

Hyp initialization is now passed the vector that is used for the host
and it is swapped for the guest vector during the context switch.

Signed-off-by: Andrew Scull <ascull@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200915104643.2543892-7-ascull@google.com
---
 arch/arm64/include/asm/kvm_asm.h |   2 +
 arch/arm64/kernel/image-vars.h   |   1 +
 arch/arm64/kvm/arm.c             |  11 +++-
 arch/arm64/kvm/hyp/hyp-entry.S   |  66 -------------------
 arch/arm64/kvm/hyp/nvhe/Makefile |   2 +-
 arch/arm64/kvm/hyp/nvhe/host.S   | 108 +++++++++++++++++++++++++++++++
 arch/arm64/kvm/hyp/nvhe/switch.c |   3 +
 7 files changed, 125 insertions(+), 68 deletions(-)
 create mode 100644 arch/arm64/kvm/hyp/nvhe/host.S

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index a952859117b2..fe51c06d480d 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -111,8 +111,10 @@ struct kvm_vcpu;
 struct kvm_s2_mmu;
 
 DECLARE_KVM_NVHE_SYM(__kvm_hyp_init);
+DECLARE_KVM_NVHE_SYM(__kvm_hyp_host_vector);
 DECLARE_KVM_HYP_SYM(__kvm_hyp_vector);
 #define __kvm_hyp_init		CHOOSE_NVHE_SYM(__kvm_hyp_init)
+#define __kvm_hyp_host_vector	CHOOSE_NVHE_SYM(__kvm_hyp_host_vector)
 #define __kvm_hyp_vector	CHOOSE_HYP_SYM(__kvm_hyp_vector)
 
 #ifdef CONFIG_KVM_INDIRECT_VECTORS
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 8982b68289b7..54bb0eb34b0f 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -71,6 +71,7 @@ KVM_NVHE_ALIAS(kvm_update_va_mask);
 /* Global kernel state accessed by nVHE hyp code. */
 KVM_NVHE_ALIAS(arm64_ssbd_callback_required);
 KVM_NVHE_ALIAS(kvm_host_data);
+KVM_NVHE_ALIAS(kvm_hyp_vector);
 KVM_NVHE_ALIAS(kvm_vgic_global_state);
 
 /* Kernel constant needed to compute idmap addresses. */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 77fc856ea513..b6442c6be5ad 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1277,7 +1277,7 @@ static void cpu_init_hyp_mode(void)
 
 	pgd_ptr = kvm_mmu_get_httbr();
 	hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE;
-	vector_ptr = __this_cpu_read(kvm_hyp_vector);
+	vector_ptr = (unsigned long)kern_hyp_va(kvm_ksym_ref(__kvm_hyp_host_vector));
 
 	/*
 	 * Call initialization code, and switch to the full blown HYP code.
@@ -1542,6 +1542,7 @@ static int init_hyp_mode(void)
 
 	for_each_possible_cpu(cpu) {
 		struct kvm_host_data *cpu_data;
+		unsigned long *vector;
 
 		cpu_data = per_cpu_ptr(&kvm_host_data, cpu);
 		err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP);
@@ -1550,6 +1551,14 @@ static int init_hyp_mode(void)
 			kvm_err("Cannot map host CPU state: %d\n", err);
 			goto out_err;
 		}
+
+		vector = per_cpu_ptr(&kvm_hyp_vector, cpu);
+		err = create_hyp_mappings(vector, vector + 1, PAGE_HYP);
+
+		if (err) {
+			kvm_err("Cannot map hyp guest vector address\n");
+			goto out_err;
+		}
 	}
 
 	err = hyp_map_aux_data();
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 9cb3fbca5d79..f92489250dfc 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -12,7 +12,6 @@
 #include <asm/cpufeature.h>
 #include <asm/kvm_arm.h>
 #include <asm/kvm_asm.h>
-#include <asm/kvm_mmu.h>
 #include <asm/mmu.h>
 
 .macro save_caller_saved_regs_vect
@@ -41,20 +40,6 @@
 
 	.text
 
-.macro do_el2_call
-	/*
-	 * Shuffle the parameters before calling the function
-	 * pointed to in x0. Assumes parameters in x[1,2,3].
-	 */
-	str	lr, [sp, #-16]!
-	mov	lr, x0
-	mov	x0, x1
-	mov	x1, x2
-	mov	x2, x3
-	blr	lr
-	ldr	lr, [sp], #16
-.endm
-
 el1_sync:				// Guest trapped into EL2
 
 	mrs	x0, esr_el2
@@ -63,44 +48,6 @@ el1_sync:				// Guest trapped into EL2
 	ccmp	x0, #ESR_ELx_EC_HVC32, #4, ne
 	b.ne	el1_trap
 
-#ifdef __KVM_NVHE_HYPERVISOR__
-	mrs	x1, vttbr_el2		// If vttbr is valid, the guest
-	cbnz	x1, el1_hvc_guest	// called HVC
-
-	/* Here, we're pretty sure the host called HVC. */
-	ldp	x0, x1, [sp], #16
-
-	/* Check for a stub HVC call */
-	cmp	x0, #HVC_STUB_HCALL_NR
-	b.hs	1f
-
-	/*
-	 * Compute the idmap address of __kvm_handle_stub_hvc and
-	 * jump there. Since we use kimage_voffset, do not use the
-	 * HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead
-	 * (by loading it from the constant pool).
-	 *
-	 * Preserve x0-x4, which may contain stub parameters.
-	 */
-	ldr	x5, =__kvm_handle_stub_hvc
-	ldr_l	x6, kimage_voffset
-
-	/* x5 = __pa(x5) */
-	sub	x5, x5, x6
-	br	x5
-
-1:
-	/*
-	 * Perform the EL2 call
-	 */
-	kern_hyp_va	x0
-	do_el2_call
-
-	eret
-	sb
-#endif /* __KVM_NVHE_HYPERVISOR__ */
-
-el1_hvc_guest:
 	/*
 	 * Fastest possible path for ARM_SMCCC_ARCH_WORKAROUND_1.
 	 * The workaround has already been applied on the host,
@@ -198,18 +145,6 @@ el2_error:
 	eret
 	sb
 
-#ifdef __KVM_NVHE_HYPERVISOR__
-SYM_FUNC_START(__hyp_do_panic)
-	mov	lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
-		      PSR_MODE_EL1h)
-	msr	spsr_el2, lr
-	ldr	lr, =panic
-	msr	elr_el2, lr
-	eret
-	sb
-SYM_FUNC_END(__hyp_do_panic)
-#endif
-
 .macro invalid_vector	label, target = hyp_panic
 	.align	2
 SYM_CODE_START(\label)
@@ -222,7 +157,6 @@ SYM_CODE_END(\label)
 	invalid_vector	el2t_irq_invalid
 	invalid_vector	el2t_fiq_invalid
 	invalid_vector	el2t_error_invalid
-	invalid_vector	el2h_sync_invalid
 	invalid_vector	el2h_irq_invalid
 	invalid_vector	el2h_fiq_invalid
 	invalid_vector	el1_fiq_invalid
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index aef76487edc2..ddf98eb07b9d 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -6,7 +6,7 @@
 asflags-y := -D__KVM_NVHE_HYPERVISOR__
 ccflags-y := -D__KVM_NVHE_HYPERVISOR__
 
-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o
+obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o
 obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
 	 ../fpsimd.o ../hyp-entry.o
 
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
new file mode 100644
index 000000000000..128af58d342d
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -0,0 +1,108 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 - Google Inc
+ * Author: Andrew Scull <ascull@google.com>
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/assembler.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_mmu.h>
+
+	.text
+
+SYM_FUNC_START(__hyp_do_panic)
+	mov	lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
+		      PSR_MODE_EL1h)
+	msr	spsr_el2, lr
+	ldr	lr, =panic
+	msr	elr_el2, lr
+	eret
+	sb
+SYM_FUNC_END(__hyp_do_panic)
+
+.macro host_el1_sync_vect
+	.align 7
+.L__vect_start\@:
+	esb
+	stp	x0, x1, [sp, #-16]!
+	mrs	x0, esr_el2
+	lsr	x0, x0, #ESR_ELx_EC_SHIFT
+	cmp	x0, #ESR_ELx_EC_HVC64
+	ldp	x0, x1, [sp], #16
+	b.ne	hyp_panic
+
+	/* Check for a stub HVC call */
+	cmp	x0, #HVC_STUB_HCALL_NR
+	b.hs	1f
+
+	/*
+	 * Compute the idmap address of __kvm_handle_stub_hvc and
+	 * jump there. Since we use kimage_voffset, do not use the
+	 * HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead
+	 * (by loading it from the constant pool).
+	 *
+	 * Preserve x0-x4, which may contain stub parameters.
+	 */
+	ldr	x5, =__kvm_handle_stub_hvc
+	ldr_l	x6, kimage_voffset
+
+	/* x5 = __pa(x5) */
+	sub	x5, x5, x6
+	br	x5
+
+1:
+	/*
+	 * Shuffle the parameters before calling the function
+	 * pointed to in x0. Assumes parameters in x[1,2,3].
+	 */
+	kern_hyp_va	x0
+	str	lr, [sp, #-16]!
+	mov	lr, x0
+	mov	x0, x1
+	mov	x1, x2
+	mov	x2, x3
+	blr	lr
+	ldr	lr, [sp], #16
+
+	eret
+	sb
+.L__vect_end\@:
+.if ((.L__vect_end\@ - .L__vect_start\@) > 0x80)
+	.error "host_el1_sync_vect larger than vector entry"
+.endif
+.endm
+
+.macro invalid_host_vect
+	.align 7
+	b	hyp_panic
+.endm
+
+/*
+ * CONFIG_KVM_INDIRECT_VECTORS is not applied to the host vectors because the
+ * host knows about the EL2 vectors already, and there is no point in hiding
+ * them.
+ */
+	.align 11
+SYM_CODE_START(__kvm_hyp_host_vector)
+	invalid_host_vect			// Synchronous EL2t
+	invalid_host_vect			// IRQ EL2t
+	invalid_host_vect			// FIQ EL2t
+	invalid_host_vect			// Error EL2t
+
+	invalid_host_vect			// Synchronous EL2h
+	invalid_host_vect			// IRQ EL2h
+	invalid_host_vect			// FIQ EL2h
+	invalid_host_vect			// Error EL2h
+
+	host_el1_sync_vect			// Synchronous 64-bit EL1
+	invalid_host_vect			// IRQ 64-bit EL1
+	invalid_host_vect			// FIQ 64-bit EL1
+	invalid_host_vect			// Error 64-bit EL1
+
+	invalid_host_vect			// Synchronous 32-bit EL1
+	invalid_host_vect			// IRQ 32-bit EL1
+	invalid_host_vect			// FIQ 32-bit EL1
+	invalid_host_vect			// Error 32-bit EL1
+SYM_CODE_END(__kvm_hyp_host_vector)
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 26d6fd4b6f4d..075384084e08 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -42,6 +42,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
 	}
 
 	write_sysreg(val, cptr_el2);
+	write_sysreg(__hyp_this_cpu_read(kvm_hyp_vector), vbar_el2);
 
 	if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
 		struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt;
@@ -60,6 +61,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
 
 static void __deactivate_traps(struct kvm_vcpu *vcpu)
 {
+	extern char __kvm_hyp_host_vector[];
 	u64 mdcr_el2;
 
 	___deactivate_traps(vcpu);
@@ -91,6 +93,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
 	write_sysreg(mdcr_el2, mdcr_el2);
 	write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2);
 	write_sysreg(CPTR_EL2_DEFAULT, cptr_el2);
+	write_sysreg(__kvm_hyp_host_vector, vbar_el2);
 }
 
 static void __load_host_stage2(void)

From 472fc011ccd30f05e0b39d71064777d39dd11cac Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Tue, 15 Sep 2020 11:46:31 +0100
Subject: [PATCH 28/87] KVM: arm64: nVHE: Don't consume host SErrors with ESB

The ESB at the start of the host vector may cause SErrors to be consumed
to DISR_EL1. However, this is not checked for the host so the SError
could go unhandled.

Remove the ESB so that SErrors are not consumed but are instead left
pending for the host to consume. __guest_enter already defers entry into
a guest if there are any SErrors pending.

Signed-off-by: Andrew Scull <ascull@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Cc: James Morse <james.morse@arm.com>
Link: https://lore.kernel.org/r/20200915104643.2543892-8-ascull@google.com
---
 arch/arm64/kvm/hyp/nvhe/host.S | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
index 128af58d342d..da21fddcef75 100644
--- a/arch/arm64/kvm/hyp/nvhe/host.S
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -25,7 +25,6 @@ SYM_FUNC_END(__hyp_do_panic)
 .macro host_el1_sync_vect
 	.align 7
 .L__vect_start\@:
-	esb
 	stp	x0, x1, [sp, #-16]!
 	mrs	x0, esr_el2
 	lsr	x0, x0, #ESR_ELx_EC_SHIFT
@@ -80,6 +79,11 @@ SYM_FUNC_END(__hyp_do_panic)
 .endm
 
 /*
+ * The host vector does not use an ESB instruction in order to avoid consuming
+ * SErrors that should only be consumed by the host. Guest entry is deferred by
+ * __guest_enter if there are any pending asynchronous exceptions so hyp will
+ * always return to the host without having consumerd host SErrors.
+ *
  * CONFIG_KVM_INDIRECT_VECTORS is not applied to the host vectors because the
  * host knows about the EL2 vectors already, and there is no point in hiding
  * them.

From b619d9aa8b385bdab747df596961d8fdddde4d52 Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Tue, 15 Sep 2020 11:46:32 +0100
Subject: [PATCH 29/87] KVM: arm64: Introduce hyp context

During __guest_enter, save and restore from a new hyp context rather
than the host context. This is preparation for separation of the hyp and
host context in nVHE.

Signed-off-by: Andrew Scull <ascull@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200915104643.2543892-9-ascull@google.com
---
 arch/arm64/include/asm/kvm_hyp.h        |  3 ++-
 arch/arm64/kernel/image-vars.h          |  1 +
 arch/arm64/kvm/arm.c                    | 10 ++++++++++
 arch/arm64/kvm/hyp/entry.S              | 10 +++++-----
 arch/arm64/kvm/hyp/include/hyp/switch.h |  2 +-
 arch/arm64/kvm/hyp/nvhe/switch.c        |  2 +-
 arch/arm64/kvm/hyp/vhe/switch.c         |  2 +-
 7 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 1e2491da324e..0b525e05e5bf 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -12,6 +12,7 @@
 #include <asm/alternative.h>
 #include <asm/sysreg.h>
 
+DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
 DECLARE_PER_CPU(unsigned long, kvm_hyp_vector);
 
 #define read_sysreg_elx(r,nvh,vh)					\
@@ -89,7 +90,7 @@ void activate_traps_vhe_load(struct kvm_vcpu *vcpu);
 void deactivate_traps_vhe_put(void);
 #endif
 
-u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
+u64 __guest_enter(struct kvm_vcpu *vcpu);
 
 void __noreturn hyp_panic(void);
 #ifdef __KVM_NVHE_HYPERVISOR__
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 54bb0eb34b0f..9f419e4fc66b 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -71,6 +71,7 @@ KVM_NVHE_ALIAS(kvm_update_va_mask);
 /* Global kernel state accessed by nVHE hyp code. */
 KVM_NVHE_ALIAS(arm64_ssbd_callback_required);
 KVM_NVHE_ALIAS(kvm_host_data);
+KVM_NVHE_ALIAS(kvm_hyp_ctxt);
 KVM_NVHE_ALIAS(kvm_hyp_vector);
 KVM_NVHE_ALIAS(kvm_vgic_global_state);
 
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index b6442c6be5ad..ae4b34f91e94 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -47,6 +47,7 @@ __asm__(".arch_extension	virt");
 #endif
 
 DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
+DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
 DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
 static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
 
@@ -1542,6 +1543,7 @@ static int init_hyp_mode(void)
 
 	for_each_possible_cpu(cpu) {
 		struct kvm_host_data *cpu_data;
+		struct kvm_cpu_context *hyp_ctxt;
 		unsigned long *vector;
 
 		cpu_data = per_cpu_ptr(&kvm_host_data, cpu);
@@ -1552,6 +1554,14 @@ static int init_hyp_mode(void)
 			goto out_err;
 		}
 
+		hyp_ctxt = per_cpu_ptr(&kvm_hyp_ctxt, cpu);
+		err = create_hyp_mappings(hyp_ctxt, hyp_ctxt + 1, PAGE_HYP);
+
+		if (err) {
+			kvm_err("Cannot map hyp context: %d\n", err);
+			goto out_err;
+		}
+
 		vector = per_cpu_ptr(&kvm_hyp_vector, cpu);
 		err = create_hyp_mappings(vector, vector + 1, PAGE_HYP);
 
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index 76e7eaf4675e..9551d7f186da 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -57,15 +57,15 @@
 .endm
 
 /*
- * u64 __guest_enter(struct kvm_vcpu *vcpu,
- *		     struct kvm_cpu_context *host_ctxt);
+ * u64 __guest_enter(struct kvm_vcpu *vcpu);
  */
 SYM_FUNC_START(__guest_enter)
 	// x0: vcpu
-	// x1: host context
-	// x2-x17: clobbered by macros
+	// x1-x17: clobbered by macros
 	// x29: guest context
 
+	hyp_adr_this_cpu x1, kvm_hyp_ctxt, x2
+
 	// Store the host regs
 	save_callee_saved_regs x1
 
@@ -148,7 +148,7 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
 	// Store the guest's sp_el0
 	save_sp_el0	x1, x2
 
-	get_host_ctxt	x2, x3
+	hyp_adr_this_cpu x2, kvm_hyp_ctxt, x3
 
 	// Macro ptrauth_switch_to_guest format:
 	// 	ptrauth_switch_to_host(guest cxt, host cxt, tmp1, tmp2, tmp3)
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 96ea3fdd0c20..afe714056b97 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -381,7 +381,7 @@ static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu)
 	    !esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu)))
 		return false;
 
-	ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+	ctxt = __hyp_this_cpu_ptr(kvm_hyp_ctxt);
 	__ptrauth_save_key(ctxt, APIA);
 	__ptrauth_save_key(ctxt, APIB);
 	__ptrauth_save_key(ctxt, APDA);
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 075384084e08..c99945cda779 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -209,7 +209,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 
 	do {
 		/* Jump in the fire! */
-		exit_code = __guest_enter(vcpu, host_ctxt);
+		exit_code = __guest_enter(vcpu);
 
 		/* And we're baaack! */
 	} while (fixup_guest_exit(vcpu, &exit_code));
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index b49cf53c11f0..cf477f856e51 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -135,7 +135,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
 
 	do {
 		/* Jump in the fire! */
-		exit_code = __guest_enter(vcpu, host_ctxt);
+		exit_code = __guest_enter(vcpu);
 
 		/* And we're baaack! */
 	} while (fixup_guest_exit(vcpu, &exit_code));

From 7c2e76d87f9ce7af47a07ca803343fd5f4aa4ab5 Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Tue, 15 Sep 2020 11:46:33 +0100
Subject: [PATCH 30/87] KVM: arm64: Update context references from host to hyp

Hyp now has its own nominal context for saving and restoring its state
when switching to and from a guest. Update the related comments and
utilities to match the new name.

Signed-off-by: Andrew Scull <ascull@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200915104643.2543892-10-ascull@google.com
---
 arch/arm64/include/asm/kvm_ptrauth.h |  6 +++---
 arch/arm64/kvm/hyp/entry.S           | 22 +++++++++++-----------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_ptrauth.h b/arch/arm64/include/asm/kvm_ptrauth.h
index 0ddf98c3ba9f..0cd0965255d2 100644
--- a/arch/arm64/include/asm/kvm_ptrauth.h
+++ b/arch/arm64/include/asm/kvm_ptrauth.h
@@ -60,7 +60,7 @@
 .endm
 
 /*
- * Both ptrauth_switch_to_guest and ptrauth_switch_to_host macros will
+ * Both ptrauth_switch_to_guest and ptrauth_switch_to_hyp macros will
  * check for the presence ARM64_HAS_ADDRESS_AUTH, which is defined as
  * (ARM64_HAS_ADDRESS_AUTH_ARCH || ARM64_HAS_ADDRESS_AUTH_IMP_DEF) and
  * then proceed ahead with the save/restore of Pointer Authentication
@@ -78,7 +78,7 @@ alternative_else_nop_endif
 .L__skip_switch\@:
 .endm
 
-.macro ptrauth_switch_to_host g_ctxt, h_ctxt, reg1, reg2, reg3
+.macro ptrauth_switch_to_hyp g_ctxt, h_ctxt, reg1, reg2, reg3
 alternative_if_not ARM64_HAS_ADDRESS_AUTH
 	b	.L__skip_switch\@
 alternative_else_nop_endif
@@ -96,7 +96,7 @@ alternative_else_nop_endif
 #else /* !CONFIG_ARM64_PTR_AUTH */
 .macro ptrauth_switch_to_guest g_ctxt, reg1, reg2, reg3
 .endm
-.macro ptrauth_switch_to_host g_ctxt, h_ctxt, reg1, reg2, reg3
+.macro ptrauth_switch_to_hyp g_ctxt, h_ctxt, reg1, reg2, reg3
 .endm
 #endif /* CONFIG_ARM64_PTR_AUTH */
 #endif /* __ASSEMBLY__ */
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index 9551d7f186da..38cca690a6ff 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -66,16 +66,16 @@ SYM_FUNC_START(__guest_enter)
 
 	hyp_adr_this_cpu x1, kvm_hyp_ctxt, x2
 
-	// Store the host regs
+	// Store the hyp regs
 	save_callee_saved_regs x1
 
-	// Save the host's sp_el0
+	// Save hyp's sp_el0
 	save_sp_el0	x1, x2
 
-	// Now the host state is stored if we have a pending RAS SError it must
-	// affect the host. If any asynchronous exception is pending we defer
-	// the guest entry. The DSB isn't necessary before v8.2 as any SError
-	// would be fatal.
+	// Now the hyp state is stored if we have a pending RAS SError it must
+	// affect the host or hyp. If any asynchronous exception is pending we
+	// defer the guest entry. The DSB isn't necessary before v8.2 as any
+	// SError would be fatal.
 alternative_if ARM64_HAS_RAS_EXTN
 	dsb	nshst
 	isb
@@ -150,17 +150,17 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
 
 	hyp_adr_this_cpu x2, kvm_hyp_ctxt, x3
 
-	// Macro ptrauth_switch_to_guest format:
-	// 	ptrauth_switch_to_host(guest cxt, host cxt, tmp1, tmp2, tmp3)
+	// Macro ptrauth_switch_to_hyp format:
+	// 	ptrauth_switch_to_hyp(guest cxt, host cxt, tmp1, tmp2, tmp3)
 	// The below macro to save/restore keys is not implemented in C code
 	// as it may cause Pointer Authentication key signing mismatch errors
 	// when this feature is enabled for kernel code.
-	ptrauth_switch_to_host x1, x2, x3, x4, x5
+	ptrauth_switch_to_hyp x1, x2, x3, x4, x5
 
-	// Restore the hosts's sp_el0
+	// Restore hyp's sp_el0
 	restore_sp_el0 x2, x3
 
-	// Now restore the host regs
+	// Now restore the hyp regs
 	restore_callee_saved_regs x2
 
 alternative_if ARM64_HAS_RAS_EXTN

From 7db21530479f071ee0e0a4d5fcf5e6bc6c0352ba Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Tue, 15 Sep 2020 11:46:34 +0100
Subject: [PATCH 31/87] KVM: arm64: Restore hyp when panicking in guest context

If the guest context is loaded when a panic is triggered, restore the
hyp context so e.g. the shadow call stack works when hyp_panic() is
called and SP_EL0 is valid when the host's panic() is called.

Use the hyp context's __hyp_running_vcpu field to track when hyp
transitions to and from the guest vcpu so the exception handlers know
whether the context needs to be restored.

Signed-off-by: Andrew Scull <ascull@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200915104643.2543892-11-ascull@google.com
---
 arch/arm64/include/asm/kvm_asm.h        | 10 ++++++++++
 arch/arm64/kvm/hyp/entry.S              | 24 ++++++++++++++++++++++++
 arch/arm64/kvm/hyp/hyp-entry.S          |  5 ++---
 arch/arm64/kvm/hyp/include/hyp/switch.h |  4 +++-
 arch/arm64/kvm/hyp/nvhe/host.S          |  5 +++++
 5 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index fe51c06d480d..4df2bd8882bc 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -236,6 +236,16 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
 	ldr	\vcpu, [\ctxt, #HOST_CONTEXT_VCPU]
 .endm
 
+.macro get_loaded_vcpu vcpu, ctxt
+	hyp_adr_this_cpu \ctxt, kvm_hyp_ctxt, \vcpu
+	ldr	\vcpu, [\ctxt, #HOST_CONTEXT_VCPU]
+.endm
+
+.macro set_loaded_vcpu vcpu, ctxt, tmp
+	hyp_adr_this_cpu \ctxt, kvm_hyp_ctxt, \tmp
+	str	\vcpu, [\ctxt, #HOST_CONTEXT_VCPU]
+.endm
+
 /*
  * KVM extable for unexpected exceptions.
  * In the same format _asm_extable, but output to a different section so that
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index 38cca690a6ff..4787fc82790c 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -86,6 +86,8 @@ alternative_else_nop_endif
 	ret
 
 1:
+	set_loaded_vcpu x0, x1, x2
+
 	add	x29, x0, #VCPU_CONTEXT
 
 	// Macro ptrauth_switch_to_guest format:
@@ -116,6 +118,26 @@ alternative_else_nop_endif
 	eret
 	sb
 
+SYM_INNER_LABEL(__guest_exit_panic, SYM_L_GLOBAL)
+	// x2-x29,lr: vcpu regs
+	// vcpu x0-x1 on the stack
+
+	// If the hyp context is loaded, go straight to hyp_panic
+	get_loaded_vcpu x0, x1
+	cbz	x0, hyp_panic
+
+	// The hyp context is saved so make sure it is restored to allow
+	// hyp_panic to run at hyp and, subsequently, panic to run in the host.
+	// This makes use of __guest_exit to avoid duplication but sets the
+	// return address to tail call into hyp_panic. As a side effect, the
+	// current state is saved to the guest context but it will only be
+	// accurate if the guest had been completely restored.
+	hyp_adr_this_cpu x0, kvm_hyp_ctxt, x1
+	adr	x1, hyp_panic
+	str	x1, [x0, #CPU_XREG_OFFSET(30)]
+
+	get_vcpu_ptr	x1, x0
+
 SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
 	// x0: return code
 	// x1: vcpu
@@ -163,6 +185,8 @@ SYM_INNER_LABEL(__guest_exit, SYM_L_GLOBAL)
 	// Now restore the hyp regs
 	restore_callee_saved_regs x2
 
+	set_loaded_vcpu xzr, x1, x2
+
 alternative_if ARM64_HAS_RAS_EXTN
 	// If we have the RAS extensions we can consume a pending error
 	// without an unmask-SError and isb. The ESB-instruction consumed any
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index f92489250dfc..bc9f53df46f5 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -145,7 +145,7 @@ el2_error:
 	eret
 	sb
 
-.macro invalid_vector	label, target = hyp_panic
+.macro invalid_vector	label, target = __guest_exit_panic
 	.align	2
 SYM_CODE_START(\label)
 	b \target
@@ -186,10 +186,9 @@ check_preamble_length 661b, 662b
 .macro invalid_vect target
 	.align 7
 661:
-	b	\target
 	nop
+	stp	x0, x1, [sp, #-16]!
 662:
-	ldp	x0, x1, [sp], #16
 	b	\target
 
 check_preamble_length 661b, 662b
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index afe714056b97..821721b78ad9 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -509,6 +509,7 @@ static inline void __set_host_arch_workaround_state(struct kvm_vcpu *vcpu)
 
 static inline void __kvm_unexpected_el2_exception(void)
 {
+	extern char __guest_exit_panic[];
 	unsigned long addr, fixup;
 	struct exception_table_entry *entry, *end;
 	unsigned long elr_el2 = read_sysreg(elr_el2);
@@ -529,7 +530,8 @@ static inline void __kvm_unexpected_el2_exception(void)
 		return;
 	}
 
-	hyp_panic();
+	/* Trigger a panic after restoring the hyp context. */
+	write_sysreg(__guest_exit_panic, elr_el2);
 }
 
 #endif /* __ARM64_KVM_HYP_SWITCH_H__ */
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
index da21fddcef75..9ab7814e6114 100644
--- a/arch/arm64/kvm/hyp/nvhe/host.S
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -75,6 +75,11 @@ SYM_FUNC_END(__hyp_do_panic)
 
 .macro invalid_host_vect
 	.align 7
+	/* If a guest is loaded, panic out of it. */
+	stp	x0, x1, [sp, #-16]!
+	get_loaded_vcpu x0, x1
+	cbnz	x0, __guest_exit_panic
+	add	sp, sp, #16
 	b	hyp_panic
 .endm
 

From 603d2bdaa57e48a33d57eeb88971b88fbe5875d6 Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Tue, 15 Sep 2020 11:46:35 +0100
Subject: [PATCH 32/87] KVM: arm64: Share context save and restore macros

To avoid duplicating the context save and restore macros, move them into
a shareable header.

Signed-off-by: Andrew Scull <ascull@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200915104643.2543892-12-ascull@google.com
---
 arch/arm64/include/asm/kvm_asm.h | 39 ++++++++++++++++++++++++++++++++
 arch/arm64/kvm/hyp/entry.S       | 39 --------------------------------
 2 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 4df2bd8882bc..4e3073858346 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -261,6 +261,45 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
 	.popsection
 .endm
 
+#define CPU_XREG_OFFSET(x)	(CPU_USER_PT_REGS + 8*x)
+#define CPU_LR_OFFSET		CPU_XREG_OFFSET(30)
+#define CPU_SP_EL0_OFFSET	(CPU_LR_OFFSET + 8)
+
+/*
+ * We treat x18 as callee-saved as the host may use it as a platform
+ * register (e.g. for shadow call stack).
+ */
+.macro save_callee_saved_regs ctxt
+	str	x18,      [\ctxt, #CPU_XREG_OFFSET(18)]
+	stp	x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
+	stp	x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
+	stp	x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
+	stp	x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
+	stp	x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
+	stp	x29, lr,  [\ctxt, #CPU_XREG_OFFSET(29)]
+.endm
+
+.macro restore_callee_saved_regs ctxt
+	// We require \ctxt is not x18-x28
+	ldr	x18,      [\ctxt, #CPU_XREG_OFFSET(18)]
+	ldp	x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
+	ldp	x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
+	ldp	x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
+	ldp	x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
+	ldp	x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
+	ldp	x29, lr,  [\ctxt, #CPU_XREG_OFFSET(29)]
+.endm
+
+.macro save_sp_el0 ctxt, tmp
+	mrs	\tmp,	sp_el0
+	str	\tmp,	[\ctxt, #CPU_SP_EL0_OFFSET]
+.endm
+
+.macro restore_sp_el0 ctxt, tmp
+	ldr	\tmp,	  [\ctxt, #CPU_SP_EL0_OFFSET]
+	msr	sp_el0, \tmp
+.endm
+
 #endif
 
 #endif /* __ARM_KVM_ASM_H__ */
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index 4787fc82790c..afaa8d1f2485 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -7,7 +7,6 @@
 #include <linux/linkage.h>
 
 #include <asm/alternative.h>
-#include <asm/asm-offsets.h>
 #include <asm/assembler.h>
 #include <asm/fpsimdmacros.h>
 #include <asm/kvm.h>
@@ -16,46 +15,8 @@
 #include <asm/kvm_mmu.h>
 #include <asm/kvm_ptrauth.h>
 
-#define CPU_XREG_OFFSET(x)	(CPU_USER_PT_REGS + 8*x)
-#define CPU_SP_EL0_OFFSET	(CPU_XREG_OFFSET(30) + 8)
-
 	.text
 
-/*
- * We treat x18 as callee-saved as the host may use it as a platform
- * register (e.g. for shadow call stack).
- */
-.macro save_callee_saved_regs ctxt
-	str	x18,      [\ctxt, #CPU_XREG_OFFSET(18)]
-	stp	x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
-	stp	x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
-	stp	x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
-	stp	x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
-	stp	x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
-	stp	x29, lr,  [\ctxt, #CPU_XREG_OFFSET(29)]
-.endm
-
-.macro restore_callee_saved_regs ctxt
-	// We require \ctxt is not x18-x28
-	ldr	x18,      [\ctxt, #CPU_XREG_OFFSET(18)]
-	ldp	x19, x20, [\ctxt, #CPU_XREG_OFFSET(19)]
-	ldp	x21, x22, [\ctxt, #CPU_XREG_OFFSET(21)]
-	ldp	x23, x24, [\ctxt, #CPU_XREG_OFFSET(23)]
-	ldp	x25, x26, [\ctxt, #CPU_XREG_OFFSET(25)]
-	ldp	x27, x28, [\ctxt, #CPU_XREG_OFFSET(27)]
-	ldp	x29, lr,  [\ctxt, #CPU_XREG_OFFSET(29)]
-.endm
-
-.macro save_sp_el0 ctxt, tmp
-	mrs	\tmp,	sp_el0
-	str	\tmp,	[\ctxt, #CPU_SP_EL0_OFFSET]
-.endm
-
-.macro restore_sp_el0 ctxt, tmp
-	ldr	\tmp,	  [\ctxt, #CPU_SP_EL0_OFFSET]
-	msr	sp_el0, \tmp
-.endm
-
 /*
  * u64 __guest_enter(struct kvm_vcpu *vcpu);
  */

From 4e3393a969a0bdaae83263918b6824b2d08c3996 Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Tue, 15 Sep 2020 11:46:36 +0100
Subject: [PATCH 33/87] KVM: arm64: nVHE: Switch to hyp context for EL2

Save and restore the host context when switching to and from hyp. This
gives hyp its own context that the host will not see as a step towards a
full trust boundary between the two.

SP_EL0 and pointer authentication keys are currently shared between the
host and hyp so don't need to be switched yet.

Signed-off-by: Andrew Scull <ascull@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200915104643.2543892-13-ascull@google.com
---
 arch/arm64/kvm/hyp/include/hyp/switch.h |  2 +
 arch/arm64/kvm/hyp/nvhe/Makefile        |  2 +-
 arch/arm64/kvm/hyp/nvhe/host.S          | 68 ++++++++++++++++++-------
 arch/arm64/kvm/hyp/nvhe/hyp-main.c      | 36 +++++++++++++
 4 files changed, 89 insertions(+), 19 deletions(-)
 create mode 100644 arch/arm64/kvm/hyp/nvhe/hyp-main.c

diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 821721b78ad9..4536b50ddc06 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -372,6 +372,8 @@ static inline bool esr_is_ptrauth_trap(u32 esr)
 	ctxt_sys_reg(ctxt, key ## KEYHI_EL1) = __val;                   \
 } while(0)
 
+DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
+
 static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpu_context *ctxt;
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index ddf98eb07b9d..46c89e8c30bc 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -6,7 +6,7 @@
 asflags-y := -D__KVM_NVHE_HYPERVISOR__
 ccflags-y := -D__KVM_NVHE_HYPERVISOR__
 
-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o
+obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o hyp-main.o
 obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
 	 ../fpsimd.o ../hyp-entry.o
 
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
index 9ab7814e6114..d26e41773dc4 100644
--- a/arch/arm64/kvm/hyp/nvhe/host.S
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -12,6 +12,55 @@
 
 	.text
 
+SYM_FUNC_START(__host_exit)
+	stp	x0, x1, [sp, #-16]!
+
+	get_host_ctxt	x0, x1
+
+	ALTERNATIVE(nop, SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN)
+
+	/* Store the host regs x2 and x3 */
+	stp	x2, x3,   [x0, #CPU_XREG_OFFSET(2)]
+
+	/* Retrieve the host regs x0-x1 from the stack */
+	ldp	x2, x3, [sp], #16	// x0, x1
+
+	/* Store the host regs x0-x1 and x4-x17 */
+	stp	x2, x3,   [x0, #CPU_XREG_OFFSET(0)]
+	stp	x4, x5,   [x0, #CPU_XREG_OFFSET(4)]
+	stp	x6, x7,   [x0, #CPU_XREG_OFFSET(6)]
+	stp	x8, x9,   [x0, #CPU_XREG_OFFSET(8)]
+	stp	x10, x11, [x0, #CPU_XREG_OFFSET(10)]
+	stp	x12, x13, [x0, #CPU_XREG_OFFSET(12)]
+	stp	x14, x15, [x0, #CPU_XREG_OFFSET(14)]
+	stp	x16, x17, [x0, #CPU_XREG_OFFSET(16)]
+
+	/* Store the host regs x18-x29, lr */
+	save_callee_saved_regs x0
+
+	/* Save the host context pointer in x29 across the function call */
+	mov	x29, x0
+	bl	handle_trap
+
+	/* Restore host regs x0-x17 */
+	ldp	x0, x1,   [x29, #CPU_XREG_OFFSET(0)]
+	ldp	x2, x3,   [x29, #CPU_XREG_OFFSET(2)]
+	ldp	x4, x5,   [x29, #CPU_XREG_OFFSET(4)]
+	ldp	x6, x7,   [x29, #CPU_XREG_OFFSET(6)]
+	ldp	x8, x9,   [x29, #CPU_XREG_OFFSET(8)]
+	ldp	x10, x11, [x29, #CPU_XREG_OFFSET(10)]
+	ldp	x12, x13, [x29, #CPU_XREG_OFFSET(12)]
+	ldp	x14, x15, [x29, #CPU_XREG_OFFSET(14)]
+	ldp	x16, x17, [x29, #CPU_XREG_OFFSET(16)]
+
+	/* Restore host regs x18-x29, lr */
+	restore_callee_saved_regs x29
+
+	/* Do not touch any register after this! */
+	eret
+	sb
+SYM_FUNC_END(__host_exit)
+
 SYM_FUNC_START(__hyp_do_panic)
 	mov	lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
 		      PSR_MODE_EL1h)
@@ -34,7 +83,7 @@ SYM_FUNC_END(__hyp_do_panic)
 
 	/* Check for a stub HVC call */
 	cmp	x0, #HVC_STUB_HCALL_NR
-	b.hs	1f
+	b.hs	__host_exit
 
 	/*
 	 * Compute the idmap address of __kvm_handle_stub_hvc and
@@ -50,23 +99,6 @@ SYM_FUNC_END(__hyp_do_panic)
 	/* x5 = __pa(x5) */
 	sub	x5, x5, x6
 	br	x5
-
-1:
-	/*
-	 * Shuffle the parameters before calling the function
-	 * pointed to in x0. Assumes parameters in x[1,2,3].
-	 */
-	kern_hyp_va	x0
-	str	lr, [sp, #-16]!
-	mov	lr, x0
-	mov	x0, x1
-	mov	x1, x2
-	mov	x2, x3
-	blr	lr
-	ldr	lr, [sp], #16
-
-	eret
-	sb
 .L__vect_end\@:
 .if ((.L__vect_end\@ - .L__vect_start\@) > 0x80)
 	.error "host_el1_sync_vect larger than vector entry"
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
new file mode 100644
index 000000000000..570c3896f42e
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 - Google Inc
+ * Author: Andrew Scull <ascull@google.com>
+ */
+
+#include <hyp/switch.h>
+
+#include <asm/kvm_asm.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_host.h>
+#include <asm/kvm_hyp.h>
+#include <asm/kvm_mmu.h>
+
+typedef unsigned long (*hypcall_fn_t)
+	(unsigned long, unsigned long, unsigned long);
+
+void handle_trap(struct kvm_cpu_context *host_ctxt)
+{
+	u64 esr = read_sysreg_el2(SYS_ESR);
+	hypcall_fn_t func;
+	unsigned long ret;
+
+	if (ESR_ELx_EC(esr) != ESR_ELx_EC_HVC64)
+		hyp_panic();
+
+	/*
+	 * __kvm_call_hyp takes a pointer in the host address space and
+	 * up to three arguments.
+	 */
+	func = (hypcall_fn_t)kern_hyp_va(host_ctxt->regs.regs[0]);
+	ret = func(host_ctxt->regs.regs[1],
+		   host_ctxt->regs.regs[2],
+		   host_ctxt->regs.regs[3]);
+	host_ctxt->regs.regs[0] = ret;
+}

From a2e102e20fd69ce26ea9fe7bd0df26d25dafff59 Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Tue, 15 Sep 2020 11:46:37 +0100
Subject: [PATCH 34/87] KVM: arm64: nVHE: Handle hyp panics

Restore the host context when panicking from hyp to give the best chance
of the panic being clean.

The host requires that registers be preserved such as x18 for the shadow
callstack. If the panic is caused by an exception from EL1, the host
context is still valid so the panic can return straight back to the
host. If the panic comes from EL2 then it's most likely that the hyp
context is active and the host context needs to be restored.

There are windows before and after the host context is saved and
restored that restoration is attempted incorrectly and the panic won't
be clean.

Signed-off-by: Andrew Scull <ascull@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200915104643.2543892-14-ascull@google.com
---
 arch/arm64/include/asm/kvm_hyp.h |  2 +-
 arch/arm64/kvm/hyp/nvhe/host.S   | 76 ++++++++++++++++++++++++--------
 arch/arm64/kvm/hyp/nvhe/switch.c | 18 +++-----
 3 files changed, 63 insertions(+), 33 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 0b525e05e5bf..6b664de5ec1f 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -94,7 +94,7 @@ u64 __guest_enter(struct kvm_vcpu *vcpu);
 
 void __noreturn hyp_panic(void);
 #ifdef __KVM_NVHE_HYPERVISOR__
-void __noreturn __hyp_do_panic(unsigned long, ...);
+void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par);
 #endif
 
 #endif /* __ARM64_KVM_HYP_H__ */
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
index d26e41773dc4..ff9a0f547b9f 100644
--- a/arch/arm64/kvm/hyp/nvhe/host.S
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -47,6 +47,9 @@ SYM_FUNC_START(__host_exit)
 	ldp	x2, x3,   [x29, #CPU_XREG_OFFSET(2)]
 	ldp	x4, x5,   [x29, #CPU_XREG_OFFSET(4)]
 	ldp	x6, x7,   [x29, #CPU_XREG_OFFSET(6)]
+
+	/* x0-7 are use for panic arguments */
+__host_enter_for_panic:
 	ldp	x8, x9,   [x29, #CPU_XREG_OFFSET(8)]
 	ldp	x10, x11, [x29, #CPU_XREG_OFFSET(10)]
 	ldp	x12, x13, [x29, #CPU_XREG_OFFSET(12)]
@@ -57,18 +60,38 @@ SYM_FUNC_START(__host_exit)
 	restore_callee_saved_regs x29
 
 	/* Do not touch any register after this! */
+__host_enter_without_restoring:
 	eret
 	sb
 SYM_FUNC_END(__host_exit)
 
+/*
+ * void __noreturn __hyp_do_panic(bool restore_host, u64 spsr, u64 elr, u64 par);
+ */
 SYM_FUNC_START(__hyp_do_panic)
+	/* Load the format arguments into x1-7 */
+	mov	x6, x3
+	get_vcpu_ptr x7, x3
+
+	mrs	x3, esr_el2
+	mrs	x4, far_el2
+	mrs	x5, hpfar_el2
+
+	/* Prepare and exit to the host's panic funciton. */
 	mov	lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\
 		      PSR_MODE_EL1h)
 	msr	spsr_el2, lr
 	ldr	lr, =panic
 	msr	elr_el2, lr
-	eret
-	sb
+
+	/*
+	 * Set the panic format string and enter the host, conditionally
+	 * restoring the host context.
+	 */
+	cmp	x0, xzr
+	ldr	x0, =__hyp_panic_string
+	b.eq	__host_enter_without_restoring
+	b	__host_enter_for_panic
 SYM_FUNC_END(__hyp_do_panic)
 
 .macro host_el1_sync_vect
@@ -79,7 +102,7 @@ SYM_FUNC_END(__hyp_do_panic)
 	lsr	x0, x0, #ESR_ELx_EC_SHIFT
 	cmp	x0, #ESR_ELx_EC_HVC64
 	ldp	x0, x1, [sp], #16
-	b.ne	hyp_panic
+	b.ne	__host_exit
 
 	/* Check for a stub HVC call */
 	cmp	x0, #HVC_STUB_HCALL_NR
@@ -105,16 +128,31 @@ SYM_FUNC_END(__hyp_do_panic)
 .endif
 .endm
 
-.macro invalid_host_vect
+.macro invalid_host_el2_vect
 	.align 7
 	/* If a guest is loaded, panic out of it. */
 	stp	x0, x1, [sp, #-16]!
 	get_loaded_vcpu x0, x1
 	cbnz	x0, __guest_exit_panic
 	add	sp, sp, #16
+
+	/*
+	 * The panic may not be clean if the exception is taken before the host
+	 * context has been saved by __host_exit or after the hyp context has
+	 * been partially clobbered by __host_enter.
+	 */
 	b	hyp_panic
 .endm
 
+.macro invalid_host_el1_vect
+	.align 7
+	mov	x0, xzr		/* restore_host = false */
+	mrs	x1, spsr_el2
+	mrs	x2, elr_el2
+	mrs	x3, par_el1
+	b	__hyp_do_panic
+.endm
+
 /*
  * The host vector does not use an ESB instruction in order to avoid consuming
  * SErrors that should only be consumed by the host. Guest entry is deferred by
@@ -127,23 +165,23 @@ SYM_FUNC_END(__hyp_do_panic)
  */
 	.align 11
 SYM_CODE_START(__kvm_hyp_host_vector)
-	invalid_host_vect			// Synchronous EL2t
-	invalid_host_vect			// IRQ EL2t
-	invalid_host_vect			// FIQ EL2t
-	invalid_host_vect			// Error EL2t
+	invalid_host_el2_vect			// Synchronous EL2t
+	invalid_host_el2_vect			// IRQ EL2t
+	invalid_host_el2_vect			// FIQ EL2t
+	invalid_host_el2_vect			// Error EL2t
 
-	invalid_host_vect			// Synchronous EL2h
-	invalid_host_vect			// IRQ EL2h
-	invalid_host_vect			// FIQ EL2h
-	invalid_host_vect			// Error EL2h
+	invalid_host_el2_vect			// Synchronous EL2h
+	invalid_host_el2_vect			// IRQ EL2h
+	invalid_host_el2_vect			// FIQ EL2h
+	invalid_host_el2_vect			// Error EL2h
 
 	host_el1_sync_vect			// Synchronous 64-bit EL1
-	invalid_host_vect			// IRQ 64-bit EL1
-	invalid_host_vect			// FIQ 64-bit EL1
-	invalid_host_vect			// Error 64-bit EL1
+	invalid_host_el1_vect			// IRQ 64-bit EL1
+	invalid_host_el1_vect			// FIQ 64-bit EL1
+	invalid_host_el1_vect			// Error 64-bit EL1
 
-	invalid_host_vect			// Synchronous 32-bit EL1
-	invalid_host_vect			// IRQ 32-bit EL1
-	invalid_host_vect			// FIQ 32-bit EL1
-	invalid_host_vect			// Error 32-bit EL1
+	invalid_host_el1_vect			// Synchronous 32-bit EL1
+	invalid_host_el1_vect			// IRQ 32-bit EL1
+	invalid_host_el1_vect			// FIQ 32-bit EL1
+	invalid_host_el1_vect			// Error 32-bit EL1
 SYM_CODE_END(__kvm_hyp_host_vector)
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index c99945cda779..29febf9a93f2 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -242,6 +242,8 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 	if (system_uses_irq_prio_masking())
 		gic_write_pmr(GIC_PRIO_IRQOFF);
 
+	host_ctxt->__hyp_running_vcpu = NULL;
+
 	return exit_code;
 }
 
@@ -250,31 +252,21 @@ void __noreturn hyp_panic(void)
 	u64 spsr = read_sysreg_el2(SYS_SPSR);
 	u64 elr = read_sysreg_el2(SYS_ELR);
 	u64 par = read_sysreg(par_el1);
+	bool restore_host = true;
 	struct kvm_cpu_context *host_ctxt;
 	struct kvm_vcpu *vcpu;
-	unsigned long str_va;
 
 	host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
 	vcpu = host_ctxt->__hyp_running_vcpu;
 
-	if (read_sysreg(vttbr_el2)) {
+	if (vcpu) {
 		__timer_disable_traps(vcpu);
 		__deactivate_traps(vcpu);
 		__load_host_stage2();
 		__sysreg_restore_state_nvhe(host_ctxt);
 	}
 
-	/*
-	 * Force the panic string to be loaded from the literal pool,
-	 * making sure it is a kernel address and not a PC-relative
-	 * reference.
-	 */
-	asm volatile("ldr %0, =%1" : "=r" (str_va) : "S" (__hyp_panic_string));
-
-	__hyp_do_panic(str_va,
-		       spsr, elr,
-		       read_sysreg(esr_el2), read_sysreg_el2(SYS_FAR),
-		       read_sysreg(hpfar_el2), par, vcpu);
+	__hyp_do_panic(restore_host, spsr, elr, par);
 	unreachable();
 }
 

From 5dc33bd199ca3b9f53c28144fa9a20b43c908972 Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Tue, 15 Sep 2020 11:46:38 +0100
Subject: [PATCH 35/87] KVM: arm64: nVHE: Pass pointers consistently to
 hyp-init

Rather than some being kernel pointer and others being hyp pointers,
standardize on all pointers being hyp pointers.

Signed-off-by: Andrew Scull <ascull@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200915104643.2543892-15-ascull@google.com
---
 arch/arm64/kvm/arm.c               | 1 +
 arch/arm64/kvm/hyp/nvhe/hyp-init.S | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index ae4b34f91e94..6b7180072c8d 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1278,6 +1278,7 @@ static void cpu_init_hyp_mode(void)
 
 	pgd_ptr = kvm_mmu_get_httbr();
 	hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE;
+	hyp_stack_ptr = kern_hyp_va(hyp_stack_ptr);
 	vector_ptr = (unsigned long)kern_hyp_va(kvm_ksym_ref(__kvm_hyp_host_vector));
 
 	/*
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index d9434e90c06d..abe885e26fe2 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -114,7 +114,6 @@ alternative_else_nop_endif
 	isb
 
 	/* Set the stack and new vectors */
-	kern_hyp_va	x1
 	mov	sp, x1
 	msr	vbar_el2, x2
 

From cf6501689012ef346cc835a38cbebb1e398d0834 Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Tue, 15 Sep 2020 11:46:39 +0100
Subject: [PATCH 36/87] smccc: Define vendor hyp owned service call region

Vendor specific hypervisor services have their own region of function
identifiers reserved by SMCCC. Extend the list of owners to include this
case.

Signed-off-by: Andrew Scull <ascull@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Link: https://lore.kernel.org/r/20200915104643.2543892-16-ascull@google.com
---
 include/linux/arm-smccc.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index 15c706fb0a37..ee286f5de239 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -49,6 +49,7 @@
 #define ARM_SMCCC_OWNER_OEM		3
 #define ARM_SMCCC_OWNER_STANDARD	4
 #define ARM_SMCCC_OWNER_STANDARD_HYP	5
+#define ARM_SMCCC_OWNER_VENDOR_HYP	6
 #define ARM_SMCCC_OWNER_TRUSTED_APP	48
 #define ARM_SMCCC_OWNER_TRUSTED_APP_END	49
 #define ARM_SMCCC_OWNER_TRUSTED_OS	50

From 0794a974d74dc777a212a3c58dd236f507360348 Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Tue, 15 Sep 2020 11:46:40 +0100
Subject: [PATCH 37/87] smccc: Use separate variables for args and results

Using the same register-bound variable for both arguments and results
means these values share a type. That type must allow the arguments to
be assigned to it and must also be assignable to the unsigned long
fields of struct arm_smccc_res.

This restriction on types causes compiler warnings when the argument
cannot be implicitly assigned to an unsigned long, for example the
pointers that are used in the KVM hyp interface.

By separating the arguments and results into their own variables, the
type constraint is lifted allowing the arguments to avoid the need for
any type conversion.

Signed-off-by: Andrew Scull <ascull@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Link: https://lore.kernel.org/r/20200915104643.2543892-17-ascull@google.com
---
 include/linux/arm-smccc.h | 73 ++++++++++++++++-----------------------
 1 file changed, 29 insertions(+), 44 deletions(-)

diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index ee286f5de239..885c9ffc835c 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -228,87 +228,67 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
 #define __count_args(...)						\
 	___count_args(__VA_ARGS__, 7, 6, 5, 4, 3, 2, 1, 0)
 
-#define __constraint_write_0						\
-	"+r" (r0), "=&r" (r1), "=&r" (r2), "=&r" (r3)
-#define __constraint_write_1						\
-	"+r" (r0), "+r" (r1), "=&r" (r2), "=&r" (r3)
-#define __constraint_write_2						\
-	"+r" (r0), "+r" (r1), "+r" (r2), "=&r" (r3)
-#define __constraint_write_3						\
-	"+r" (r0), "+r" (r1), "+r" (r2), "+r" (r3)
-#define __constraint_write_4	__constraint_write_3
-#define __constraint_write_5	__constraint_write_4
-#define __constraint_write_6	__constraint_write_5
-#define __constraint_write_7	__constraint_write_6
-
-#define __constraint_read_0
-#define __constraint_read_1
-#define __constraint_read_2
-#define __constraint_read_3
-#define __constraint_read_4	"r" (r4)
-#define __constraint_read_5	__constraint_read_4, "r" (r5)
-#define __constraint_read_6	__constraint_read_5, "r" (r6)
-#define __constraint_read_7	__constraint_read_6, "r" (r7)
+#define __constraint_read_0	"r" (arg0)
+#define __constraint_read_1	__constraint_read_0, "r" (arg1)
+#define __constraint_read_2	__constraint_read_1, "r" (arg2)
+#define __constraint_read_3	__constraint_read_2, "r" (arg3)
+#define __constraint_read_4	__constraint_read_3, "r" (arg4)
+#define __constraint_read_5	__constraint_read_4, "r" (arg5)
+#define __constraint_read_6	__constraint_read_5, "r" (arg6)
+#define __constraint_read_7	__constraint_read_6, "r" (arg7)
 
 #define __declare_arg_0(a0, res)					\
 	struct arm_smccc_res   *___res = res;				\
-	register unsigned long r0 asm("r0") = (u32)a0;			\
-	register unsigned long r1 asm("r1");				\
-	register unsigned long r2 asm("r2");				\
-	register unsigned long r3 asm("r3")
+	register unsigned long arg0 asm("r0") = (u32)a0
 
 #define __declare_arg_1(a0, a1, res)					\
 	typeof(a1) __a1 = a1;						\
 	struct arm_smccc_res   *___res = res;				\
-	register unsigned long r0 asm("r0") = (u32)a0;			\
-	register unsigned long r1 asm("r1") = __a1;			\
-	register unsigned long r2 asm("r2");				\
-	register unsigned long r3 asm("r3")
+	register unsigned long arg0 asm("r0") = (u32)a0;			\
+	register typeof(a1) arg1 asm("r1") = __a1
 
 #define __declare_arg_2(a0, a1, a2, res)				\
 	typeof(a1) __a1 = a1;						\
 	typeof(a2) __a2 = a2;						\
 	struct arm_smccc_res   *___res = res;				\
-	register unsigned long r0 asm("r0") = (u32)a0;			\
-	register unsigned long r1 asm("r1") = __a1;			\
-	register unsigned long r2 asm("r2") = __a2;			\
-	register unsigned long r3 asm("r3")
+	register unsigned long arg0 asm("r0") = (u32)a0;			\
+	register typeof(a1) arg1 asm("r1") = __a1;			\
+	register typeof(a2) arg2 asm("r2") = __a2
 
 #define __declare_arg_3(a0, a1, a2, a3, res)				\
 	typeof(a1) __a1 = a1;						\
 	typeof(a2) __a2 = a2;						\
 	typeof(a3) __a3 = a3;						\
 	struct arm_smccc_res   *___res = res;				\
-	register unsigned long r0 asm("r0") = (u32)a0;			\
-	register unsigned long r1 asm("r1") = __a1;			\
-	register unsigned long r2 asm("r2") = __a2;			\
-	register unsigned long r3 asm("r3") = __a3
+	register unsigned long arg0 asm("r0") = (u32)a0;			\
+	register typeof(a1) arg1 asm("r1") = __a1;			\
+	register typeof(a2) arg2 asm("r2") = __a2;			\
+	register typeof(a3) arg3 asm("r3") = __a3
 
 #define __declare_arg_4(a0, a1, a2, a3, a4, res)			\
 	typeof(a4) __a4 = a4;						\
 	__declare_arg_3(a0, a1, a2, a3, res);				\
-	register unsigned long r4 asm("r4") = __a4
+	register typeof(a4) arg4 asm("r4") = __a4
 
 #define __declare_arg_5(a0, a1, a2, a3, a4, a5, res)			\
 	typeof(a5) __a5 = a5;						\
 	__declare_arg_4(a0, a1, a2, a3, a4, res);			\
-	register unsigned long r5 asm("r5") = __a5
+	register typeof(a5) arg5 asm("r5") = __a5
 
 #define __declare_arg_6(a0, a1, a2, a3, a4, a5, a6, res)		\
 	typeof(a6) __a6 = a6;						\
 	__declare_arg_5(a0, a1, a2, a3, a4, a5, res);			\
-	register unsigned long r6 asm("r6") = __a6
+	register typeof(a6) arg6 asm("r6") = __a6
 
 #define __declare_arg_7(a0, a1, a2, a3, a4, a5, a6, a7, res)		\
 	typeof(a7) __a7 = a7;						\
 	__declare_arg_6(a0, a1, a2, a3, a4, a5, a6, res);		\
-	register unsigned long r7 asm("r7") = __a7
+	register typeof(a7) arg7 asm("r7") = __a7
 
 #define ___declare_args(count, ...) __declare_arg_ ## count(__VA_ARGS__)
 #define __declare_args(count, ...)  ___declare_args(count, __VA_ARGS__)
 
 #define ___constraints(count)						\
-	: __constraint_write_ ## count					\
 	: __constraint_read_ ## count					\
 	: "memory"
 #define __constraints(count)	___constraints(count)
@@ -320,8 +300,13 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
  */
 #define __arm_smccc_1_1(inst, ...)					\
 	do {								\
+		register unsigned long r0 asm("r0");			\
+		register unsigned long r1 asm("r1");			\
+		register unsigned long r2 asm("r2");			\
+		register unsigned long r3 asm("r3"); 			\
 		__declare_args(__count_args(__VA_ARGS__), __VA_ARGS__);	\
-		asm volatile(inst "\n"					\
+		asm volatile(inst "\n" :				\
+			     "=r" (r0), "=r" (r1), "=r" (r2), "=r" (r3)	\
 			     __constraints(__count_args(__VA_ARGS__)));	\
 		if (___res)						\
 			*___res = (typeof(*___res)){r0, r1, r2, r3};	\
@@ -367,7 +352,7 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
 #define __fail_smccc_1_1(...)						\
 	do {								\
 		__declare_args(__count_args(__VA_ARGS__), __VA_ARGS__);	\
-		asm ("" __constraints(__count_args(__VA_ARGS__)));	\
+		asm ("" : __constraints(__count_args(__VA_ARGS__)));	\
 		if (___res)						\
 			___res->a0 = SMCCC_RET_NOT_SUPPORTED;		\
 	} while (0)

From 054698316d87a13e4c0dc8f5ecdd77b12e5eb676 Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Tue, 15 Sep 2020 11:46:41 +0100
Subject: [PATCH 38/87] KVM: arm64: nVHE: Migrate hyp interface to SMCCC

Rather than passing arbitrary function pointers to run at hyp, define
and equivalent set of SMCCC functions.

Since the SMCCC functions are strongly tied to the original function
prototypes, it is not expected for the host to ever call an invalid ID
but a warning is raised if this does ever occur.

As __kvm_vcpu_run is used for every switch between the host and a guest,
it is explicitly singled out to be identified before the other function
IDs to improve the performance of the hot path.

Signed-off-by: Andrew Scull <ascull@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200915104643.2543892-18-ascull@google.com
---
 arch/arm64/include/asm/kvm_asm.h   |  24 +++++++
 arch/arm64/include/asm/kvm_host.h  |  25 ++++---
 arch/arm64/kvm/arm.c               |   2 +-
 arch/arm64/kvm/hyp.S               |  24 ++-----
 arch/arm64/kvm/hyp/nvhe/hyp-main.c | 107 +++++++++++++++++++++++++----
 5 files changed, 139 insertions(+), 43 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 4e3073858346..3e4577013d33 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -38,6 +38,30 @@
 
 #define __SMCCC_WORKAROUND_1_SMC_SZ 36
 
+#define KVM_HOST_SMCCC_ID(id)						\
+	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,				\
+			   ARM_SMCCC_SMC_64,				\
+			   ARM_SMCCC_OWNER_VENDOR_HYP,			\
+			   (id))
+
+#define KVM_HOST_SMCCC_FUNC(name) KVM_HOST_SMCCC_ID(__KVM_HOST_SMCCC_FUNC_##name)
+
+#define __KVM_HOST_SMCCC_FUNC___kvm_hyp_init			0
+#define __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run			1
+#define __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context		2
+#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa		3
+#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid		4
+#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_local_vmid	5
+#define __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff		6
+#define __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs			7
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_ich_vtr_el2		8
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr		9
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr		10
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs		11
+#define __KVM_HOST_SMCCC_FUNC___kvm_get_mdcr_el2		12
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs		13
+#define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs		14
+
 #ifndef __ASSEMBLY__
 
 #include <linux/mm.h>
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 16adbefde1cc..82c941cf8890 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -11,6 +11,7 @@
 #ifndef __ARM64_KVM_HOST_H__
 #define __ARM64_KVM_HOST_H__
 
+#include <linux/arm-smccc.h>
 #include <linux/bitmap.h>
 #include <linux/types.h>
 #include <linux/jump_label.h>
@@ -479,18 +480,20 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 void kvm_arm_halt_guest(struct kvm *kvm);
 void kvm_arm_resume_guest(struct kvm *kvm);
 
-u64 __kvm_call_hyp(void *hypfn, ...);
+u64 __kvm_call_hyp_init(phys_addr_t pgd_ptr,
+			unsigned long hyp_stack_ptr,
+			unsigned long vector_ptr,
+			unsigned long tpidr_el2);
 
-#define kvm_call_hyp_nvhe(f, ...)					\
-	do {								\
-		DECLARE_KVM_NVHE_SYM(f);				\
-		__kvm_call_hyp(kvm_ksym_ref_nvhe(f), ##__VA_ARGS__);	\
-	} while(0)
-
-#define kvm_call_hyp_nvhe_ret(f, ...)					\
+#define kvm_call_hyp_nvhe(f, ...)						\
 	({								\
-		DECLARE_KVM_NVHE_SYM(f);				\
-		__kvm_call_hyp(kvm_ksym_ref_nvhe(f), ##__VA_ARGS__);	\
+		struct arm_smccc_res res;				\
+									\
+		arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(f),		\
+				  ##__VA_ARGS__, &res);			\
+		WARN_ON(res.a0 != SMCCC_RET_SUCCESS);			\
+									\
+		res.a1;							\
 	})
 
 /*
@@ -516,7 +519,7 @@ u64 __kvm_call_hyp(void *hypfn, ...);
 			ret = f(__VA_ARGS__);				\
 			isb();						\
 		} else {						\
-			ret = kvm_call_hyp_nvhe_ret(f, ##__VA_ARGS__);	\
+			ret = kvm_call_hyp_nvhe(f, ##__VA_ARGS__);	\
 		}							\
 									\
 		ret;							\
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 6b7180072c8d..49aa08bd26de 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1288,7 +1288,7 @@ static void cpu_init_hyp_mode(void)
 	 * cpus_have_const_cap() wrapper.
 	 */
 	BUG_ON(!system_capabilities_finalized());
-	__kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2);
+	__kvm_call_hyp_init(pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2);
 
 	/*
 	 * Disabling SSBD on a non-VHE system requires us to enable SSBS
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
index 3c79a1124af2..12aa426f7559 100644
--- a/arch/arm64/kvm/hyp.S
+++ b/arch/arm64/kvm/hyp.S
@@ -11,24 +11,12 @@
 #include <asm/cpufeature.h>
 
 /*
- * u64 __kvm_call_hyp(void *hypfn, ...);
- *
- * This is not really a variadic function in the classic C-way and care must
- * be taken when calling this to ensure parameters are passed in registers
- * only, since the stack will change between the caller and the callee.
- *
- * Call the function with the first argument containing a pointer to the
- * function you wish to call in Hyp mode, and subsequent arguments will be
- * passed as x0, x1, and x2 (a maximum of 3 arguments in addition to the
- * function pointer can be passed).  The function being called must be mapped
- * in Hyp mode (see init_hyp_mode in arch/arm/kvm/arm.c).  Return values are
- * passed in x0.
- *
- * A function pointer with a value less than 0xfff has a special meaning,
- * and is used to implement hyp stubs in the same way as in
- * arch/arm64/kernel/hyp_stub.S.
+ * u64 __kvm_call_hyp_init(phys_addr_t pgd_ptr,
+ * 			   unsigned long hyp_stack_ptr,
+ * 			   unsigned long vector_ptr,
+ * 			   unsigned long tpidr_el2);
  */
-SYM_FUNC_START(__kvm_call_hyp)
+SYM_FUNC_START(__kvm_call_hyp_init)
 	hvc	#0
 	ret
-SYM_FUNC_END(__kvm_call_hyp)
+SYM_FUNC_END(__kvm_call_hyp_init)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 570c3896f42e..41aaf038599a 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -12,25 +12,106 @@
 #include <asm/kvm_hyp.h>
 #include <asm/kvm_mmu.h>
 
-typedef unsigned long (*hypcall_fn_t)
-	(unsigned long, unsigned long, unsigned long);
+#include <kvm/arm_hypercalls.h>
+
+static void handle_host_hcall(unsigned long func_id,
+			      struct kvm_cpu_context *host_ctxt)
+{
+	unsigned long ret = 0;
+
+	switch (func_id) {
+	case KVM_HOST_SMCCC_FUNC(__kvm_vcpu_run): {
+		unsigned long r1 = host_ctxt->regs.regs[1];
+		struct kvm_vcpu *vcpu = (struct kvm_vcpu *)r1;
+
+		ret = __kvm_vcpu_run(vcpu);
+		break;
+	}
+	case KVM_HOST_SMCCC_FUNC(__kvm_flush_vm_context):
+		__kvm_flush_vm_context();
+		break;
+	case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid_ipa): {
+		unsigned long r1 = host_ctxt->regs.regs[1];
+		struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
+		phys_addr_t ipa = host_ctxt->regs.regs[2];
+		int level = host_ctxt->regs.regs[3];
+
+		__kvm_tlb_flush_vmid_ipa(mmu, ipa, level);
+		break;
+	}
+	case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid): {
+		unsigned long r1 = host_ctxt->regs.regs[1];
+		struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
+
+		__kvm_tlb_flush_vmid(mmu);
+		break;
+	}
+	case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_local_vmid): {
+		unsigned long r1 = host_ctxt->regs.regs[1];
+		struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
+
+		__kvm_tlb_flush_local_vmid(mmu);
+		break;
+	}
+	case KVM_HOST_SMCCC_FUNC(__kvm_timer_set_cntvoff): {
+		u64 cntvoff = host_ctxt->regs.regs[1];
+
+		__kvm_timer_set_cntvoff(cntvoff);
+		break;
+	}
+	case KVM_HOST_SMCCC_FUNC(__kvm_enable_ssbs):
+		__kvm_enable_ssbs();
+		break;
+	case KVM_HOST_SMCCC_FUNC(__vgic_v3_get_ich_vtr_el2):
+		ret = __vgic_v3_get_ich_vtr_el2();
+		break;
+	case KVM_HOST_SMCCC_FUNC(__vgic_v3_read_vmcr):
+		ret = __vgic_v3_read_vmcr();
+		break;
+	case KVM_HOST_SMCCC_FUNC(__vgic_v3_write_vmcr): {
+		u32 vmcr = host_ctxt->regs.regs[1];
+
+		__vgic_v3_write_vmcr(vmcr);
+		break;
+	}
+	case KVM_HOST_SMCCC_FUNC(__vgic_v3_init_lrs):
+		__vgic_v3_init_lrs();
+		break;
+	case KVM_HOST_SMCCC_FUNC(__kvm_get_mdcr_el2):
+		ret = __kvm_get_mdcr_el2();
+		break;
+	case KVM_HOST_SMCCC_FUNC(__vgic_v3_save_aprs): {
+		unsigned long r1 = host_ctxt->regs.regs[1];
+		struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1;
+
+		__vgic_v3_save_aprs(cpu_if);
+		break;
+	}
+	case KVM_HOST_SMCCC_FUNC(__vgic_v3_restore_aprs): {
+		unsigned long r1 = host_ctxt->regs.regs[1];
+		struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1;
+
+		__vgic_v3_restore_aprs(cpu_if);
+		break;
+	}
+	default:
+		/* Invalid host HVC. */
+		host_ctxt->regs.regs[0] = SMCCC_RET_NOT_SUPPORTED;
+		return;
+	}
+
+	host_ctxt->regs.regs[0] = SMCCC_RET_SUCCESS;
+	host_ctxt->regs.regs[1] = ret;
+}
 
 void handle_trap(struct kvm_cpu_context *host_ctxt)
 {
 	u64 esr = read_sysreg_el2(SYS_ESR);
-	hypcall_fn_t func;
-	unsigned long ret;
+	unsigned long func_id;
 
 	if (ESR_ELx_EC(esr) != ESR_ELx_EC_HVC64)
 		hyp_panic();
 
-	/*
-	 * __kvm_call_hyp takes a pointer in the host address space and
-	 * up to three arguments.
-	 */
-	func = (hypcall_fn_t)kern_hyp_va(host_ctxt->regs.regs[0]);
-	ret = func(host_ctxt->regs.regs[1],
-		   host_ctxt->regs.regs[2],
-		   host_ctxt->regs.regs[3]);
-	host_ctxt->regs.regs[0] = ret;
+	func_id = host_ctxt->regs.regs[0];
+	handle_host_hcall(func_id, host_ctxt);
 }

From 04e4caa8d355d19fd7d26734bd2b9e3b563bb22a Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Tue, 15 Sep 2020 11:46:42 +0100
Subject: [PATCH 39/87] KVM: arm64: nVHE: Migrate hyp-init to SMCCC

To complete the transition to SMCCC, the hyp initialization is given a
function ID. This looks neater than comparing the hyp stub function IDs
to the page table physical address.

Some care is taken to only clobber x0-3 before the host context is saved
as only those registers can be clobbered accoring to SMCCC. Fortunately,
only a few acrobatics are needed. The possible new tpidr_el2 is moved to
the argument in x2 so that it can be stashed in tpidr_el2 early to free
up a scratch register. The page table configuration then makes use of
x0-2.

Signed-off-by: Andrew Scull <ascull@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200915104643.2543892-19-ascull@google.com
---
 arch/arm64/include/asm/kvm_host.h  |  5 ---
 arch/arm64/kvm/Makefile            |  2 +-
 arch/arm64/kvm/arm.c               |  5 ++-
 arch/arm64/kvm/hyp.S               | 22 ----------
 arch/arm64/kvm/hyp/nvhe/hyp-init.S | 70 +++++++++++++++++-------------
 5 files changed, 45 insertions(+), 59 deletions(-)
 delete mode 100644 arch/arm64/kvm/hyp.S

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 82c941cf8890..ef0325c42ca0 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -480,11 +480,6 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 void kvm_arm_halt_guest(struct kvm *kvm);
 void kvm_arm_resume_guest(struct kvm *kvm);
 
-u64 __kvm_call_hyp_init(phys_addr_t pgd_ptr,
-			unsigned long hyp_stack_ptr,
-			unsigned long vector_ptr,
-			unsigned long tpidr_el2);
-
 #define kvm_call_hyp_nvhe(f, ...)						\
 	({								\
 		struct arm_smccc_res res;				\
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 99977c1972cc..1504c81fbf5d 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -13,7 +13,7 @@ obj-$(CONFIG_KVM) += hyp/
 kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
 	 $(KVM)/vfio.o $(KVM)/irqchip.o \
 	 arm.o mmu.o mmio.o psci.o perf.o hypercalls.o pvtime.o \
-	 inject_fault.o regmap.o va_layout.o hyp.o handle_exit.o \
+	 inject_fault.o regmap.o va_layout.o handle_exit.o \
 	 guest.o debug.o reset.o sys_regs.o \
 	 vgic-sys-reg-v3.o fpsimd.o pmu.o \
 	 aarch32.o arch_timer.o \
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 49aa08bd26de..c074d9824d54 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1264,6 +1264,7 @@ static void cpu_init_hyp_mode(void)
 	unsigned long hyp_stack_ptr;
 	unsigned long vector_ptr;
 	unsigned long tpidr_el2;
+	struct arm_smccc_res res;
 
 	/* Switch from the HYP stub to our own HYP init vector */
 	__hyp_set_vectors(kvm_get_idmap_vector());
@@ -1288,7 +1289,9 @@ static void cpu_init_hyp_mode(void)
 	 * cpus_have_const_cap() wrapper.
 	 */
 	BUG_ON(!system_capabilities_finalized());
-	__kvm_call_hyp_init(pgd_ptr, hyp_stack_ptr, vector_ptr, tpidr_el2);
+	arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init),
+			  pgd_ptr, tpidr_el2, hyp_stack_ptr, vector_ptr, &res);
+	WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
 
 	/*
 	 * Disabling SSBD on a non-VHE system requires us to enable SSBS
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
deleted file mode 100644
index 12aa426f7559..000000000000
--- a/arch/arm64/kvm/hyp.S
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2012,2013 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- */
-
-#include <linux/linkage.h>
-
-#include <asm/alternative.h>
-#include <asm/assembler.h>
-#include <asm/cpufeature.h>
-
-/*
- * u64 __kvm_call_hyp_init(phys_addr_t pgd_ptr,
- * 			   unsigned long hyp_stack_ptr,
- * 			   unsigned long vector_ptr,
- * 			   unsigned long tpidr_el2);
- */
-SYM_FUNC_START(__kvm_call_hyp_init)
-	hvc	#0
-	ret
-SYM_FUNC_END(__kvm_call_hyp_init)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index abe885e26fe2..47224dc62c51 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -4,11 +4,13 @@
  * Author: Marc Zyngier <marc.zyngier@arm.com>
  */
 
+#include <linux/arm-smccc.h>
 #include <linux/linkage.h>
 
 #include <asm/alternative.h>
 #include <asm/assembler.h>
 #include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
 #include <asm/kvm_mmu.h>
 #include <asm/pgtable-hwdef.h>
 #include <asm/sysreg.h>
@@ -44,27 +46,37 @@ __invalid:
 	b	.
 
 	/*
-	 * x0: HYP pgd
-	 * x1: HYP stack
-	 * x2: HYP vectors
-	 * x3: per-CPU offset
+	 * x0: SMCCC function ID
+	 * x1: HYP pgd
+	 * x2: per-CPU offset
+	 * x3: HYP stack
+	 * x4: HYP vectors
 	 */
 __do_hyp_init:
 	/* Check for a stub HVC call */
 	cmp	x0, #HVC_STUB_HCALL_NR
 	b.lo	__kvm_handle_stub_hvc
 
-	phys_to_ttbr x4, x0
-alternative_if ARM64_HAS_CNP
-	orr	x4, x4, #TTBR_CNP_BIT
-alternative_else_nop_endif
-	msr	ttbr0_el2, x4
+	/* Set tpidr_el2 for use by HYP to free a register */
+	msr	tpidr_el2, x2
 
-	mrs	x4, tcr_el1
-	mov_q	x5, TCR_EL2_MASK
-	and	x4, x4, x5
-	mov	x5, #TCR_EL2_RES1
-	orr	x4, x4, x5
+	mov	x2, #KVM_HOST_SMCCC_FUNC(__kvm_hyp_init)
+	cmp	x0, x2
+	b.eq	1f
+	mov	x0, #SMCCC_RET_NOT_SUPPORTED
+	eret
+
+1:	phys_to_ttbr x0, x1
+alternative_if ARM64_HAS_CNP
+	orr	x0, x0, #TTBR_CNP_BIT
+alternative_else_nop_endif
+	msr	ttbr0_el2, x0
+
+	mrs	x0, tcr_el1
+	mov_q	x1, TCR_EL2_MASK
+	and	x0, x0, x1
+	mov	x1, #TCR_EL2_RES1
+	orr	x0, x0, x1
 
 	/*
 	 * The ID map may be configured to use an extended virtual address
@@ -80,18 +92,18 @@ alternative_else_nop_endif
 	 *
 	 * So use the same T0SZ value we use for the ID map.
 	 */
-	ldr_l	x5, idmap_t0sz
-	bfi	x4, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH
+	ldr_l	x1, idmap_t0sz
+	bfi	x0, x1, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH
 
 	/*
 	 * Set the PS bits in TCR_EL2.
 	 */
-	tcr_compute_pa_size x4, #TCR_EL2_PS_SHIFT, x5, x6
+	tcr_compute_pa_size x0, #TCR_EL2_PS_SHIFT, x1, x2
 
-	msr	tcr_el2, x4
+	msr	tcr_el2, x0
 
-	mrs	x4, mair_el1
-	msr	mair_el2, x4
+	mrs	x0, mair_el1
+	msr	mair_el2, x0
 	isb
 
 	/* Invalidate the stale TLBs from Bootloader */
@@ -103,24 +115,22 @@ alternative_else_nop_endif
 	 * as well as the EE bit on BE. Drop the A flag since the compiler
 	 * is allowed to generate unaligned accesses.
 	 */
-	mov_q	x4, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A))
-CPU_BE(	orr	x4, x4, #SCTLR_ELx_EE)
+	mov_q	x0, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A))
+CPU_BE(	orr	x0, x0, #SCTLR_ELx_EE)
 alternative_if ARM64_HAS_ADDRESS_AUTH
-	mov_q	x5, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \
+	mov_q	x1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \
 		     SCTLR_ELx_ENDA | SCTLR_ELx_ENDB)
-	orr	x4, x4, x5
+	orr	x0, x0, x1
 alternative_else_nop_endif
-	msr	sctlr_el2, x4
+	msr	sctlr_el2, x0
 	isb
 
 	/* Set the stack and new vectors */
-	mov	sp, x1
-	msr	vbar_el2, x2
-
-	/* Set tpidr_el2 for use by HYP */
-	msr	tpidr_el2, x3
+	mov	sp, x3
+	msr	vbar_el2, x4
 
 	/* Hello, World! */
+	mov	x0, #SMCCC_RET_SUCCESS
 	eret
 SYM_CODE_END(__kvm_hyp_init)
 

From a071261d93184f43b7dede7516b6c449c4ed5770 Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Tue, 15 Sep 2020 11:46:43 +0100
Subject: [PATCH 40/87] KVM: arm64: nVHE: Fix pointers during SMCCC convertion

The host need not concern itself with the pointer differences for the
hyp interfaces that are shared between VHE and nVHE so leave it to the
hyp to handle.

As the SMCCC function IDs are converted into function calls, it is a
suitable place to also convert any pointer arguments into hyp pointers.
This, additionally, eases the reuse of the handlers in different
contexts.

Signed-off-by: Andrew Scull <ascull@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200915104643.2543892-20-ascull@google.com
---
 arch/arm64/kvm/hyp/nvhe/hyp-main.c | 12 ++++++------
 arch/arm64/kvm/hyp/nvhe/switch.c   |  2 --
 arch/arm64/kvm/hyp/nvhe/tlb.c      |  2 --
 arch/arm64/kvm/vgic/vgic-v3.c      |  4 ++--
 4 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 41aaf038599a..e2eafe2c93af 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -24,7 +24,7 @@ static void handle_host_hcall(unsigned long func_id,
 		unsigned long r1 = host_ctxt->regs.regs[1];
 		struct kvm_vcpu *vcpu = (struct kvm_vcpu *)r1;
 
-		ret = __kvm_vcpu_run(vcpu);
+		ret = __kvm_vcpu_run(kern_hyp_va(vcpu));
 		break;
 	}
 	case KVM_HOST_SMCCC_FUNC(__kvm_flush_vm_context):
@@ -36,21 +36,21 @@ static void handle_host_hcall(unsigned long func_id,
 		phys_addr_t ipa = host_ctxt->regs.regs[2];
 		int level = host_ctxt->regs.regs[3];
 
-		__kvm_tlb_flush_vmid_ipa(mmu, ipa, level);
+		__kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level);
 		break;
 	}
 	case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_vmid): {
 		unsigned long r1 = host_ctxt->regs.regs[1];
 		struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
 
-		__kvm_tlb_flush_vmid(mmu);
+		__kvm_tlb_flush_vmid(kern_hyp_va(mmu));
 		break;
 	}
 	case KVM_HOST_SMCCC_FUNC(__kvm_tlb_flush_local_vmid): {
 		unsigned long r1 = host_ctxt->regs.regs[1];
 		struct kvm_s2_mmu *mmu = (struct kvm_s2_mmu *)r1;
 
-		__kvm_tlb_flush_local_vmid(mmu);
+		__kvm_tlb_flush_local_vmid(kern_hyp_va(mmu));
 		break;
 	}
 	case KVM_HOST_SMCCC_FUNC(__kvm_timer_set_cntvoff): {
@@ -84,14 +84,14 @@ static void handle_host_hcall(unsigned long func_id,
 		unsigned long r1 = host_ctxt->regs.regs[1];
 		struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1;
 
-		__vgic_v3_save_aprs(cpu_if);
+		__vgic_v3_save_aprs(kern_hyp_va(cpu_if));
 		break;
 	}
 	case KVM_HOST_SMCCC_FUNC(__vgic_v3_restore_aprs): {
 		unsigned long r1 = host_ctxt->regs.regs[1];
 		struct vgic_v3_cpu_if *cpu_if = (struct vgic_v3_cpu_if *)r1;
 
-		__vgic_v3_restore_aprs(cpu_if);
+		__vgic_v3_restore_aprs(kern_hyp_va(cpu_if));
 		break;
 	}
 	default:
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 29febf9a93f2..a29f247f35e3 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -176,8 +176,6 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 		pmr_sync();
 	}
 
-	vcpu = kern_hyp_va(vcpu);
-
 	host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
 	host_ctxt->__hyp_running_vcpu = vcpu;
 	guest_ctxt = &vcpu->arch.ctxt;
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index 69eae608d670..544bca3072b7 100644
--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -54,7 +54,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
 	dsb(ishst);
 
 	/* Switch to requested VMID */
-	mmu = kern_hyp_va(mmu);
 	__tlb_switch_to_guest(mmu, &cxt);
 
 	/*
@@ -108,7 +107,6 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
 	dsb(ishst);
 
 	/* Switch to requested VMID */
-	mmu = kern_hyp_va(mmu);
 	__tlb_switch_to_guest(mmu, &cxt);
 
 	__tlbi(vmalls12e1is);
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 76e2d85789ed..9cdf39a94a63 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -662,7 +662,7 @@ void vgic_v3_load(struct kvm_vcpu *vcpu)
 	if (likely(cpu_if->vgic_sre))
 		kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr);
 
-	kvm_call_hyp(__vgic_v3_restore_aprs, kern_hyp_va(cpu_if));
+	kvm_call_hyp(__vgic_v3_restore_aprs, cpu_if);
 
 	if (has_vhe())
 		__vgic_v3_activate_traps(cpu_if);
@@ -686,7 +686,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu)
 
 	vgic_v3_vmcr_sync(vcpu);
 
-	kvm_call_hyp(__vgic_v3_save_aprs, kern_hyp_va(cpu_if));
+	kvm_call_hyp(__vgic_v3_save_aprs, cpu_if);
 
 	if (has_vhe())
 		__vgic_v3_deactivate_traps(cpu_if);

From 523b3999e5f620cb5ccce6a7ca2780a4cab579a2 Mon Sep 17 00:00:00 2001
From: Alexandru Elisei <alexandru.elisei@arm.com>
Date: Thu, 10 Sep 2020 14:33:51 +0100
Subject: [PATCH 41/87] KVM: arm64: Try PMD block mappings if PUD mappings are
 not supported

When userspace uses hugetlbfs for the VM memory, user_mem_abort() tries to
use the same block size to map the faulting IPA in stage 2. If stage 2
cannot the same block mapping because the block size doesn't fit in the
memslot or the memslot is not properly aligned, user_mem_abort() will fall
back to a page mapping, regardless of the block size. We can do better for
PUD backed hugetlbfs by checking if a PMD block mapping is supported before
deciding to use a page.

vma_pagesize is an unsigned long, use 1UL instead of 1ULL when assigning
its value.

Signed-off-by: Alexandru Elisei <alexandru.elisei@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200910133351.118191-1-alexandru.elisei@arm.com
---
 arch/arm64/kvm/mmu.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 21b70abf65a7..852497bd6d1e 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -779,16 +779,25 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	else
 		vma_shift = PAGE_SHIFT;
 
-	vma_pagesize = 1ULL << vma_shift;
 	if (logging_active ||
-	    (vma->vm_flags & VM_PFNMAP) ||
-	    !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
+	    (vma->vm_flags & VM_PFNMAP)) {
 		force_pte = true;
-		vma_pagesize = PAGE_SIZE;
+		vma_shift = PAGE_SHIFT;
 	}
 
+	if (vma_shift == PUD_SHIFT &&
+	    !fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
+	       vma_shift = PMD_SHIFT;
+
+	if (vma_shift == PMD_SHIFT &&
+	    !fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
+		force_pte = true;
+		vma_shift = PAGE_SHIFT;
+	}
+
+	vma_pagesize = 1UL << vma_shift;
 	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
-		fault_ipa &= huge_page_mask(hstate_vma(vma));
+		fault_ipa &= ~(vma_pagesize - 1);
 
 	gfn = fault_ipa >> PAGE_SHIFT;
 	mmap_read_unlock(current->mm);

From 8a4374f97deefb36f927c608af63dcc925521b3c Mon Sep 17 00:00:00 2001
From: Tian Tao <tiantao6@hisilicon.com>
Date: Tue, 15 Sep 2020 15:21:52 +0800
Subject: [PATCH 42/87] KVM: arm64: Fix inject_fault.c kernel-doc warnings

Fix kernel-doc warnings.
arch/arm64/kvm/inject_fault.c:210: warning: Function parameter or member
'vcpu' not described in 'kvm_inject_undefined'

Signed-off-by: Tian Tao <tiantao6@hisilicon.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/1600154512-44624-1-git-send-email-tiantao6@hisilicon.com
---
 arch/arm64/kvm/inject_fault.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index ebfdfc27b2bd..34a96ab244fa 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -202,6 +202,7 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
 
 /**
  * kvm_inject_undefined - inject an undefined instruction into the guest
+ * @vcpu: The vCPU in which to inject the exception
  *
  * It is assumed that this code is called from the VCPU thread and that the
  * VCPU therefore is not currently executing guest code.

From cb62e0b5c8db778cd29e63c2f844f36caf6859ed Mon Sep 17 00:00:00 2001
From: Liu Shixin <liushixin2@huawei.com>
Date: Wed, 16 Sep 2020 10:50:23 +0800
Subject: [PATCH 43/87] KVM: arm64: vgic-debug: Convert to use
 DEFINE_SEQ_ATTRIBUTE macro

Use DEFINE_SEQ_ATTRIBUTE macro to simplify the code.

Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200916025023.3992679-1-liushixin2@huawei.com
---
 arch/arm64/kvm/vgic/vgic-debug.c | 24 ++----------------------
 1 file changed, 2 insertions(+), 22 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c
index b13a9e3f99dd..f38c40a76251 100644
--- a/arch/arm64/kvm/vgic/vgic-debug.c
+++ b/arch/arm64/kvm/vgic/vgic-debug.c
@@ -260,34 +260,14 @@ static int vgic_debug_show(struct seq_file *s, void *v)
 	return 0;
 }
 
-static const struct seq_operations vgic_debug_seq_ops = {
+static const struct seq_operations vgic_debug_sops = {
 	.start = vgic_debug_start,
 	.next  = vgic_debug_next,
 	.stop  = vgic_debug_stop,
 	.show  = vgic_debug_show
 };
 
-static int debug_open(struct inode *inode, struct file *file)
-{
-	int ret;
-	ret = seq_open(file, &vgic_debug_seq_ops);
-	if (!ret) {
-		struct seq_file *seq;
-		/* seq_open will have modified file->private_data */
-		seq = file->private_data;
-		seq->private = inode->i_private;
-	}
-
-	return ret;
-};
-
-static const struct file_operations vgic_debug_fops = {
-	.owner   = THIS_MODULE,
-	.open    = debug_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release
-};
+DEFINE_SEQ_ATTRIBUTE(vgic_debug);
 
 void vgic_debug_init(struct kvm *kvm)
 {

From ada329e6b5b406f33fae665e62caff7814409906 Mon Sep 17 00:00:00 2001
From: Alexandru Elisei <alexandru.elisei@arm.com>
Date: Tue, 15 Sep 2020 18:04:42 +0100
Subject: [PATCH 44/87] KVM: arm64: Do not flush memslot if FWB is supported

As a result of a KVM_SET_USER_MEMORY_REGION ioctl, KVM flushes the
dcache for the memslot being changed to ensure a consistent view of memory
between the host and the guest: the host runs with caches enabled, and
it is possible for the data written by the hypervisor to still be in the
caches, but the guest is running with stage 1 disabled, meaning data
accesses are to Device-nGnRnE memory, bypassing the caches entirely.

Flushing the dcache is not necessary when KVM enables FWB, because it
forces the guest to uses cacheable memory accesses.

The current behaviour does not change, as the dcache flush helpers execute
the cache operation only if FWB is not enabled, but walking the stage 2
table is avoided.

Signed-off-by: Alexandru Elisei <alexandru.elisei@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200915170442.131635-1-alexandru.elisei@arm.com
---
 arch/arm64/kvm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 852497bd6d1e..4d68c160a7b5 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1344,7 +1344,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	spin_lock(&kvm->mmu_lock);
 	if (ret)
 		unmap_stage2_range(&kvm->arch.mmu, mem->guest_phys_addr, mem->memory_size);
-	else
+	else if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
 		stage2_flush_memslot(kvm, memslot);
 	spin_unlock(&kvm->mmu_lock);
 out:

From c9c0279cc02b4e161686de7ccd1973357f29db8c Mon Sep 17 00:00:00 2001
From: Xiaofei Tan <tanxiaofei@huawei.com>
Date: Thu, 17 Sep 2020 09:47:49 +0800
Subject: [PATCH 45/87] KVM: arm64: Fix doc warnings in mmu code

Fix following warnings caused by mismatch bewteen function parameters
and comments.
arch/arm64/kvm/mmu.c:128: warning: Function parameter or member 'mmu' not described in '__unmap_stage2_range'
arch/arm64/kvm/mmu.c:128: warning: Function parameter or member 'may_block' not described in '__unmap_stage2_range'
arch/arm64/kvm/mmu.c:128: warning: Excess function parameter 'kvm' description in '__unmap_stage2_range'
arch/arm64/kvm/mmu.c:499: warning: Function parameter or member 'writable' not described in 'kvm_phys_addr_ioremap'
arch/arm64/kvm/mmu.c:538: warning: Function parameter or member 'mmu' not described in 'stage2_wp_range'
arch/arm64/kvm/mmu.c:538: warning: Excess function parameter 'kvm' description in 'stage2_wp_range'

Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/1600307269-50957-1-git-send-email-tanxiaofei@huawei.com
---
 arch/arm64/kvm/mmu.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 4d68c160a7b5..c5c26a9cb85b 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -114,9 +114,10 @@ static bool kvm_is_device_pfn(unsigned long pfn)
  */
 /**
  * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
- * @kvm:   The VM pointer
+ * @mmu:   The KVM stage-2 MMU pointer
  * @start: The intermediate physical base address of the range to unmap
  * @size:  The size of the area to unmap
+ * @may_block: Whether or not we are permitted to block
  *
  * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
  * be called while holding mmu_lock (unless for freeing the stage2 pgd before
@@ -493,6 +494,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
  * @guest_ipa:	The IPA at which to insert the mapping
  * @pa:		The physical address of the device
  * @size:	The size of the mapping
+ * @writable:   Whether or not to create a writable mapping
  */
 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 			  phys_addr_t pa, unsigned long size, bool writable)
@@ -530,7 +532,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 
 /**
  * stage2_wp_range() - write protect stage2 memory region range
- * @kvm:	The KVM pointer
+ * @mmu:        The KVM stage-2 MMU pointer
  * @addr:	Start address of range
  * @end:	End address of range
  */

From b11483ef5a502663732c6ca1b58d14ff9eedd6f7 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 16 Jul 2020 17:11:08 +0100
Subject: [PATCH 46/87] arm64: Make use of ARCH_WORKAROUND_1 even when KVM is
 not enabled

We seem to be pretending that we don't have any firmware mitigation
when KVM is not compiled in, which is not quite expected.

Bring back the mitigation in this case.

Fixes: 4db61fef16a1 ("arm64: kvm: Modernize __smccc_workaround_1_smc_start annotations")
Cc: <stable@vger.kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/cpu_errata.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index c332d49780dc..88966496806a 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -234,14 +234,17 @@ static int detect_harden_bp_fw(void)
 		smccc_end = NULL;
 		break;
 
-#if IS_ENABLED(CONFIG_KVM)
 	case SMCCC_CONDUIT_SMC:
 		cb = call_smc_arch_workaround_1;
+#if IS_ENABLED(CONFIG_KVM)
 		smccc_start = __smccc_workaround_1_smc;
 		smccc_end = __smccc_workaround_1_smc +
 			__SMCCC_WORKAROUND_1_SMC_SZ;
-		break;
+#else
+		smccc_start = NULL;
+		smccc_end = NULL;
 #endif
+		break;
 
 	default:
 		return -1;

From 18fce56134c987e5b4eceddafdbe4b00c07e2ae1 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 16 Jul 2020 17:11:09 +0100
Subject: [PATCH 47/87] arm64: Run ARCH_WORKAROUND_1 enabling code on all CPUs

Commit 73f381660959 ("arm64: Advertise mitigation of Spectre-v2, or lack
thereof") changed the way we deal with ARCH_WORKAROUND_1, by moving most
of the enabling code to the .matches() callback.

This has the unfortunate effect that the workaround gets only enabled on
the first affected CPU, and no other.

In order to address this, forcefully call the .matches() callback from a
.cpu_enable() callback, which brings us back to the original behaviour.

Fixes: 73f381660959 ("arm64: Advertise mitigation of Spectre-v2, or lack thereof")
Cc: <stable@vger.kernel.org>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/cpu_errata.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 88966496806a..3fe64bf5a58d 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -599,6 +599,12 @@ check_branch_predictor(const struct arm64_cpu_capabilities *entry, int scope)
 	return (need_wa > 0);
 }
 
+static void
+cpu_enable_branch_predictor_hardening(const struct arm64_cpu_capabilities *cap)
+{
+	cap->matches(cap, SCOPE_LOCAL_CPU);
+}
+
 static const __maybe_unused struct midr_range tx2_family_cpus[] = {
 	MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN),
 	MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2),
@@ -890,9 +896,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 	},
 #endif
 	{
+		.desc = "Branch predictor hardening",
 		.capability = ARM64_HARDEN_BRANCH_PREDICTOR,
 		.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
 		.matches = check_branch_predictor,
+		.cpu_enable = cpu_enable_branch_predictor_hardening,
 	},
 #ifdef CONFIG_RANDOMIZE_BASE
 	{

From 42223fb100b43430daf0c396701cd75b2579ddb7 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 12 Mar 2020 17:27:36 +0000
Subject: [PATCH 48/87] KVM: arm64: Refactor PMU attribute error handling

The PMU emulation error handling is pretty messy when dealing with
attributes. Let's refactor it so that we have less duplication,
and that it is easy to extend later on.

A functional change is that kvm_arm_pmu_v3_init() used to return
-ENXIO when the PMU feature wasn't set. The error is now reported
as -ENODEV, matching the documentation. -ENXIO is still returned
when the interrupt isn't properly configured.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/pmu-emul.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index f0d0312c0a55..363f188fb707 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -735,15 +735,6 @@ int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
 
 static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
 {
-	if (!kvm_arm_support_pmu_v3())
-		return -ENODEV;
-
-	if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
-		return -ENXIO;
-
-	if (vcpu->arch.pmu.created)
-		return -EBUSY;
-
 	if (irqchip_in_kernel(vcpu->kvm)) {
 		int ret;
 
@@ -796,6 +787,13 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq)
 
 int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 {
+	if (!kvm_arm_support_pmu_v3() ||
+	    !test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
+		return -ENODEV;
+
+	if (vcpu->arch.pmu.created)
+		return -EBUSY;
+
 	switch (attr->attr) {
 	case KVM_ARM_VCPU_PMU_V3_IRQ: {
 		int __user *uaddr = (int __user *)(long)attr->addr;
@@ -804,9 +802,6 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 		if (!irqchip_in_kernel(vcpu->kvm))
 			return -EINVAL;
 
-		if (!test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
-			return -ENODEV;
-
 		if (get_user(irq, uaddr))
 			return -EFAULT;
 

From fd65a3b5f855c37167890d86e261a20bab1a14a4 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 17 Mar 2020 11:11:56 +0000
Subject: [PATCH 49/87] KVM: arm64: Use event mask matching architecture
 revision

The PMU code suffers from a small defect where we assume that the event
number provided by the guest is always 16 bit wide, even if the CPU only
implements the ARMv8.0 architecture. This isn't really problematic in
the sense that the event number ends up in a system register, cropping
it to the right width, but still this needs fixing.

In order to make it work, let's probe the version of the PMU that the
guest is going to use. This is done by temporarily creating a kernel
event and looking at the PMUVer field that has been saved at probe time
in the associated arm_pmu structure. This in turn gets saved in the kvm
structure, and subsequently used to compute the event mask that gets
used throughout the PMU code.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/kvm_host.h |  2 +
 arch/arm64/kvm/pmu-emul.c         | 80 +++++++++++++++++++++++++++++--
 2 files changed, 77 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index e52c927aade5..33e0f0a64c42 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -110,6 +110,8 @@ struct kvm_arch {
 	 * supported.
 	 */
 	bool return_nisv_io_abort_to_user;
+
+	unsigned int pmuver;
 };
 
 struct kvm_vcpu_fault_info {
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 363f188fb707..10d3860a9c0a 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -20,6 +20,21 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc *pmc);
 
 #define PERF_ATTR_CFG1_KVM_PMU_CHAINED 0x1
 
+static u32 kvm_pmu_event_mask(struct kvm *kvm)
+{
+	switch (kvm->arch.pmuver) {
+	case 1:			/* ARMv8.0 */
+		return GENMASK(9, 0);
+	case 4:			/* ARMv8.1 */
+	case 5:			/* ARMv8.4 */
+	case 6:			/* ARMv8.5 */
+		return GENMASK(15, 0);
+	default:		/* Shouldn't be here, just for sanity */
+		WARN_ONCE(1, "Unknown PMU version %d\n", kvm->arch.pmuver);
+		return 0;
+	}
+}
+
 /**
  * kvm_pmu_idx_is_64bit - determine if select_idx is a 64bit counter
  * @vcpu: The vcpu pointer
@@ -100,7 +115,7 @@ static bool kvm_pmu_idx_has_chain_evtype(struct kvm_vcpu *vcpu, u64 select_idx)
 		return false;
 
 	reg = PMEVTYPER0_EL0 + select_idx;
-	eventsel = __vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_EVENT;
+	eventsel = __vcpu_sys_reg(vcpu, reg) & kvm_pmu_event_mask(vcpu->kvm);
 
 	return eventsel == ARMV8_PMUV3_PERFCTR_CHAIN;
 }
@@ -495,7 +510,7 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val)
 
 		/* PMSWINC only applies to ... SW_INC! */
 		type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i);
-		type &= ARMV8_PMU_EVTYPE_EVENT;
+		type &= kvm_pmu_event_mask(vcpu->kvm);
 		if (type != ARMV8_PMUV3_PERFCTR_SW_INCR)
 			continue;
 
@@ -578,7 +593,7 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
 	data = __vcpu_sys_reg(vcpu, reg);
 
 	kvm_pmu_stop_counter(vcpu, pmc);
-	eventsel = data & ARMV8_PMU_EVTYPE_EVENT;
+	eventsel = data & kvm_pmu_event_mask(vcpu->kvm);;
 
 	/* Software increment event does't need to be backed by a perf event */
 	if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR &&
@@ -679,17 +694,66 @@ static void kvm_pmu_update_pmc_chained(struct kvm_vcpu *vcpu, u64 select_idx)
 void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
 				    u64 select_idx)
 {
-	u64 reg, event_type = data & ARMV8_PMU_EVTYPE_MASK;
+	u64 reg, mask;
+
+	mask  =  ARMV8_PMU_EVTYPE_MASK;
+	mask &= ~ARMV8_PMU_EVTYPE_EVENT;
+	mask |= kvm_pmu_event_mask(vcpu->kvm);
 
 	reg = (select_idx == ARMV8_PMU_CYCLE_IDX)
 	      ? PMCCFILTR_EL0 : PMEVTYPER0_EL0 + select_idx;
 
-	__vcpu_sys_reg(vcpu, reg) = event_type;
+	__vcpu_sys_reg(vcpu, reg) = data & mask;
 
 	kvm_pmu_update_pmc_chained(vcpu, select_idx);
 	kvm_pmu_create_perf_event(vcpu, select_idx);
 }
 
+static int kvm_pmu_probe_pmuver(void)
+{
+	struct perf_event_attr attr = { };
+	struct perf_event *event;
+	struct arm_pmu *pmu;
+	int pmuver = 0xf;
+
+	/*
+	 * Create a dummy event that only counts user cycles. As we'll never
+	 * leave this function with the event being live, it will never
+	 * count anything. But it allows us to probe some of the PMU
+	 * details. Yes, this is terrible.
+	 */
+	attr.type = PERF_TYPE_RAW;
+	attr.size = sizeof(attr);
+	attr.pinned = 1;
+	attr.disabled = 0;
+	attr.exclude_user = 0;
+	attr.exclude_kernel = 1;
+	attr.exclude_hv = 1;
+	attr.exclude_host = 1;
+	attr.config = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
+	attr.sample_period = GENMASK(63, 0);
+
+	event = perf_event_create_kernel_counter(&attr, -1, current,
+						 kvm_pmu_perf_overflow, &attr);
+
+	if (IS_ERR(event)) {
+		pr_err_once("kvm: pmu event creation failed %ld\n",
+			    PTR_ERR(event));
+		return 0xf;
+	}
+
+	if (event->pmu) {
+		pmu = to_arm_pmu(event->pmu);
+		if (pmu->pmuver)
+			pmuver = pmu->pmuver;
+	}
+
+	perf_event_disable(event);
+	perf_event_release_kernel(event);
+
+	return pmuver;
+}
+
 bool kvm_arm_support_pmu_v3(void)
 {
 	/*
@@ -794,6 +858,12 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 	if (vcpu->arch.pmu.created)
 		return -EBUSY;
 
+	if (!vcpu->kvm->arch.pmuver)
+		vcpu->kvm->arch.pmuver = kvm_pmu_probe_pmuver();
+
+	if (vcpu->kvm->arch.pmuver == 0xf)
+		return -ENODEV;
+
 	switch (attr->attr) {
 	case KVM_ARM_VCPU_PMU_V3_IRQ: {
 		int __user *uaddr = (int __user *)(long)attr->addr;

From d7eec2360e389cc877a76c2b098f7b745007d2b2 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Wed, 12 Feb 2020 11:31:02 +0000
Subject: [PATCH 50/87] KVM: arm64: Add PMU event filtering infrastructure

It can be desirable to expose a PMU to a guest, and yet not want the
guest to be able to count some of the implemented events (because this
would give information on shared resources, for example.

For this, let's extend the PMUv3 device API, and offer a way to setup a
bitmap of the allowed events (the default being no bitmap, and thus no
filtering).

Userspace can thus allow/deny ranges of event. The default policy
depends on the "polarity" of the first filter setup (default deny if the
filter allows events, and default allow if the filter denies events).
This allows to setup exactly what is allowed for a given guest.

Note that although the ioctl is per-vcpu, the map of allowed events is
global to the VM (it can be setup from any vcpu until the vcpu PMU is
initialized).

Reviewed-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/kvm_host.h |  5 +++
 arch/arm64/include/uapi/asm/kvm.h | 16 +++++++
 arch/arm64/kvm/arm.c              |  2 +
 arch/arm64/kvm/pmu-emul.c         | 69 ++++++++++++++++++++++++++++---
 4 files changed, 86 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 33e0f0a64c42..0384defc2a01 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -111,6 +111,11 @@ struct kvm_arch {
 	 */
 	bool return_nisv_io_abort_to_user;
 
+	/*
+	 * VM-wide PMU filter, implemented as a bitmap and big enough for
+	 * up to 2^10 events (ARMv8.0) or 2^16 events (ARMv8.1+).
+	 */
+	unsigned long *pmu_filter;
 	unsigned int pmuver;
 };
 
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index ba85bb23f060..7b1511d6ce44 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -159,6 +159,21 @@ struct kvm_sync_regs {
 struct kvm_arch_memory_slot {
 };
 
+/*
+ * PMU filter structure. Describe a range of events with a particular
+ * action. To be used with KVM_ARM_VCPU_PMU_V3_FILTER.
+ */
+struct kvm_pmu_event_filter {
+	__u16	base_event;
+	__u16	nevents;
+
+#define KVM_PMU_EVENT_ALLOW	0
+#define KVM_PMU_EVENT_DENY	1
+
+	__u8	action;
+	__u8	pad[3];
+};
+
 /* for KVM_GET/SET_VCPU_EVENTS */
 struct kvm_vcpu_events {
 	struct {
@@ -329,6 +344,7 @@ struct kvm_vcpu_events {
 #define KVM_ARM_VCPU_PMU_V3_CTRL	0
 #define   KVM_ARM_VCPU_PMU_V3_IRQ	0
 #define   KVM_ARM_VCPU_PMU_V3_INIT	1
+#define   KVM_ARM_VCPU_PMU_V3_FILTER	2
 #define KVM_ARM_VCPU_TIMER_CTRL		1
 #define   KVM_ARM_VCPU_TIMER_IRQ_VTIMER		0
 #define   KVM_ARM_VCPU_TIMER_IRQ_PTIMER		1
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 46dc3d75cf13..427848ff6f20 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -145,6 +145,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 {
 	int i;
 
+	bitmap_free(kvm->arch.pmu_filter);
+
 	kvm_vgic_destroy(kvm);
 
 	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 10d3860a9c0a..8cd365968af1 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -593,11 +593,21 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
 	data = __vcpu_sys_reg(vcpu, reg);
 
 	kvm_pmu_stop_counter(vcpu, pmc);
-	eventsel = data & kvm_pmu_event_mask(vcpu->kvm);;
+	if (pmc->idx == ARMV8_PMU_CYCLE_IDX)
+		eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
+	else
+		eventsel = data & kvm_pmu_event_mask(vcpu->kvm);
 
-	/* Software increment event does't need to be backed by a perf event */
-	if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR &&
-	    pmc->idx != ARMV8_PMU_CYCLE_IDX)
+	/* Software increment event doesn't need to be backed by a perf event */
+	if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR)
+		return;
+
+	/*
+	 * If we have a filter in place and that the event isn't allowed, do
+	 * not install a perf event either.
+	 */
+	if (vcpu->kvm->arch.pmu_filter &&
+	    !test_bit(eventsel, vcpu->kvm->arch.pmu_filter))
 		return;
 
 	memset(&attr, 0, sizeof(struct perf_event_attr));
@@ -609,8 +619,7 @@ static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 select_idx)
 	attr.exclude_kernel = data & ARMV8_PMU_EXCLUDE_EL1 ? 1 : 0;
 	attr.exclude_hv = 1; /* Don't count EL2 events */
 	attr.exclude_host = 1; /* Don't count host events */
-	attr.config = (pmc->idx == ARMV8_PMU_CYCLE_IDX) ?
-		ARMV8_PMUV3_PERFCTR_CPU_CYCLES : eventsel;
+	attr.config = eventsel;
 
 	counter = kvm_pmu_get_pair_counter_value(vcpu, pmc);
 
@@ -889,6 +898,53 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 		vcpu->arch.pmu.irq_num = irq;
 		return 0;
 	}
+	case KVM_ARM_VCPU_PMU_V3_FILTER: {
+		struct kvm_pmu_event_filter __user *uaddr;
+		struct kvm_pmu_event_filter filter;
+		int nr_events;
+
+		nr_events = kvm_pmu_event_mask(vcpu->kvm) + 1;
+
+		uaddr = (struct kvm_pmu_event_filter __user *)(long)attr->addr;
+
+		if (copy_from_user(&filter, uaddr, sizeof(filter)))
+			return -EFAULT;
+
+		if (((u32)filter.base_event + filter.nevents) > nr_events ||
+		    (filter.action != KVM_PMU_EVENT_ALLOW &&
+		     filter.action != KVM_PMU_EVENT_DENY))
+			return -EINVAL;
+
+		mutex_lock(&vcpu->kvm->lock);
+
+		if (!vcpu->kvm->arch.pmu_filter) {
+			vcpu->kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL);
+			if (!vcpu->kvm->arch.pmu_filter) {
+				mutex_unlock(&vcpu->kvm->lock);
+				return -ENOMEM;
+			}
+
+			/*
+			 * The default depends on the first applied filter.
+			 * If it allows events, the default is to deny.
+			 * Conversely, if the first filter denies a set of
+			 * events, the default is to allow.
+			 */
+			if (filter.action == KVM_PMU_EVENT_ALLOW)
+				bitmap_zero(vcpu->kvm->arch.pmu_filter, nr_events);
+			else
+				bitmap_fill(vcpu->kvm->arch.pmu_filter, nr_events);
+		}
+
+		if (filter.action == KVM_PMU_EVENT_ALLOW)
+			bitmap_set(vcpu->kvm->arch.pmu_filter, filter.base_event, filter.nevents);
+		else
+			bitmap_clear(vcpu->kvm->arch.pmu_filter, filter.base_event, filter.nevents);
+
+		mutex_unlock(&vcpu->kvm->lock);
+
+		return 0;
+	}
 	case KVM_ARM_VCPU_PMU_V3_INIT:
 		return kvm_arm_pmu_v3_init(vcpu);
 	}
@@ -925,6 +981,7 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
 	switch (attr->attr) {
 	case KVM_ARM_VCPU_PMU_V3_IRQ:
 	case KVM_ARM_VCPU_PMU_V3_INIT:
+	case KVM_ARM_VCPU_PMU_V3_FILTER:
 		if (kvm_arm_support_pmu_v3() &&
 		    test_bit(KVM_ARM_VCPU_PMU_V3, vcpu->arch.features))
 			return 0;

From 88865beca90621ae33217d5f23664d382f99205a Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 12 Mar 2020 16:11:24 +0000
Subject: [PATCH 51/87] KVM: arm64: Mask out filtered events in PCMEID{0,1}_EL1

As we can now hide events from the guest, let's also adjust its view of
PCMEID{0,1}_EL1 so that it can figure out why some common events are not
counting as they should.

The astute user can still look into the TRM for their CPU and find out
they've been cheated, though. Nobody's perfect.

Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/pmu-emul.c | 29 +++++++++++++++++++++++++++++
 arch/arm64/kvm/sys_regs.c |  5 +----
 include/kvm/arm_pmu.h     |  5 +++++
 3 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 8cd365968af1..ee13c5eecd3d 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -763,6 +763,35 @@ static int kvm_pmu_probe_pmuver(void)
 	return pmuver;
 }
 
+u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
+{
+	unsigned long *bmap = vcpu->kvm->arch.pmu_filter;
+	u64 val, mask = 0;
+	int base, i;
+
+	if (!pmceid1) {
+		val = read_sysreg(pmceid0_el0);
+		base = 0;
+	} else {
+		val = read_sysreg(pmceid1_el0);
+		base = 32;
+	}
+
+	if (!bmap)
+		return val;
+
+	for (i = 0; i < 32; i += 8) {
+		u64 byte;
+
+		byte = bitmap_get_value8(bmap, base + i);
+		mask |= byte << i;
+		byte = bitmap_get_value8(bmap, 0x4000 + base + i);
+		mask |= byte << (32 + i);
+	}
+
+	return val & mask;
+}
+
 bool kvm_arm_support_pmu_v3(void)
 {
 	/*
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 077293b5115f..20ab2a7d37ca 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -769,10 +769,7 @@ static bool access_pmceid(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
 	if (pmu_access_el0_disabled(vcpu))
 		return false;
 
-	if (!(p->Op2 & 1))
-		pmceid = read_sysreg(pmceid0_el0);
-	else
-		pmceid = read_sysreg(pmceid1_el0);
+	pmceid = kvm_pmu_get_pmceid(vcpu, (p->Op2 & 1));
 
 	p->regval = pmceid;
 
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index 6db030439e29..98cbfe885a53 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -34,6 +34,7 @@ struct kvm_pmu {
 u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx);
 void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val);
 u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu);
+u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1);
 void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu);
 void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu);
 void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu);
@@ -108,6 +109,10 @@ static inline int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
 {
 	return 0;
 }
+static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
+{
+	return 0;
+}
 #endif
 
 #endif

From 8be86a5eec046cb8bd06b4393d373cd433a32cba Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Wed, 12 Feb 2020 14:40:24 +0000
Subject: [PATCH 52/87] KVM: arm64: Document PMU filtering API

Add a small blurb describing how the event filtering API gets used.

Reviewed-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 Documentation/virt/kvm/devices/vcpu.rst | 46 +++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst
index ca374d3fe085..c8b707afca6e 100644
--- a/Documentation/virt/kvm/devices/vcpu.rst
+++ b/Documentation/virt/kvm/devices/vcpu.rst
@@ -55,6 +55,52 @@ Request the initialization of the PMUv3.  If using the PMUv3 with an in-kernel
 virtual GIC implementation, this must be done after initializing the in-kernel
 irqchip.
 
+1.3 ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_FILTER
+-----------------------------------------
+
+:Parameters: in kvm_device_attr.addr the address for a PMU event filter is a
+             pointer to a struct kvm_pmu_event_filter
+
+:Returns:
+
+	 =======  ======================================================
+	 -ENODEV: PMUv3 not supported or GIC not initialized
+	 -ENXIO:  PMUv3 not properly configured or in-kernel irqchip not
+	 	  configured as required prior to calling this attribute
+	 -EBUSY:  PMUv3 already initialized
+	 -EINVAL: Invalid filter range
+	 =======  ======================================================
+
+Request the installation of a PMU event filter described as follows:
+
+struct kvm_pmu_event_filter {
+	__u16	base_event;
+	__u16	nevents;
+
+#define KVM_PMU_EVENT_ALLOW	0
+#define KVM_PMU_EVENT_DENY	1
+
+	__u8	action;
+	__u8	pad[3];
+};
+
+A filter range is defined as the range [@base_event, @base_event + @nevents),
+together with an @action (KVM_PMU_EVENT_ALLOW or KVM_PMU_EVENT_DENY). The
+first registered range defines the global policy (global ALLOW if the first
+@action is DENY, global DENY if the first @action is ALLOW). Multiple ranges
+can be programmed, and must fit within the event space defined by the PMU
+architecture (10 bits on ARMv8.0, 16 bits from ARMv8.1 onwards).
+
+Note: "Cancelling" a filter by registering the opposite action for the same
+range doesn't change the default action. For example, installing an ALLOW
+filter for event range [0:10) as the first filter and then applying a DENY
+action for the same range will leave the whole range as disabled.
+
+Restrictions: Event 0 (SW_INCR) is never filtered, as it doesn't count a
+hardware event. Filtering event 0x1E (CHAIN) has no effect either, as it
+isn't strictly speaking an event. Filtering the cycle counter is possible
+using event 0x11 (CPU_CYCLES).
+
 
 2. GROUP: KVM_ARM_VCPU_TIMER_CTRL
 =================================

From af130d0adc8e48c73030c0d71a59ce1f7809a8fa Mon Sep 17 00:00:00 2001
From: Alexandru Elisei <alexandru.elisei@arm.com>
Date: Thu, 24 Sep 2020 13:37:30 +0100
Subject: [PATCH 53/87] KVM: arm64: Add undocumented return values for PMU
 device control group

KVM_ARM_VCPU_PMU_V3_IRQ returns -EFAULT if get_user() fails when reading
the interrupt number from kvm_device_attr.addr.

KVM_ARM_VCPU_PMU_V3_INIT returns the error value from kvm_vgic_set_owner().
kvm_arm_pmu_v3_init() checks that the vgic has been initialized and the
interrupt number is valid, but kvm_vgic_set_owner() can still return the
error code -EEXIST if another device has already claimed the interrupt.

Signed-off-by: Alexandru Elisei <alexandru.elisei@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Link: https://lore.kernel.org/r/20200924123731.268177-2-alexandru.elisei@arm.com
---
 Documentation/virt/kvm/devices/vcpu.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst
index c8b707afca6e..1da576a4b767 100644
--- a/Documentation/virt/kvm/devices/vcpu.rst
+++ b/Documentation/virt/kvm/devices/vcpu.rst
@@ -25,6 +25,7 @@ Returns:
 
 	 =======  ========================================================
 	 -EBUSY   The PMU overflow interrupt is already set
+	 -EFAULT  Error reading interrupt number
 	 -ENXIO   The overflow interrupt not set when attempting to get it
 	 -ENODEV  PMUv3 not supported
 	 -EINVAL  Invalid PMU overflow interrupt number supplied or
@@ -45,6 +46,7 @@ all vcpus, while as an SPI it must be a separate number per vcpu.
 Returns:
 
 	 =======  ======================================================
+	 -EEXIST  Interrupt number already used
 	 -ENODEV  PMUv3 not supported or GIC not initialized
 	 -ENXIO   PMUv3 not properly configured or in-kernel irqchip not
 		  configured as required prior to calling this attribute

From 51dd2eb98c72c1ff4cdb1526fff9a259a563c257 Mon Sep 17 00:00:00 2001
From: Alexandru Elisei <alexandru.elisei@arm.com>
Date: Thu, 24 Sep 2020 13:37:31 +0100
Subject: [PATCH 54/87] KVM: arm64: Match PMU error code descriptions with
 error conditions

Update the description of the PMU KVM_{GET, SET}_DEVICE_ATTR error codes
to be a better match for the code that returns them.

Signed-off-by: Alexandru Elisei <alexandru.elisei@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Link: https://lore.kernel.org/r/20200924123731.268177-3-alexandru.elisei@arm.com
---
 Documentation/virt/kvm/devices/vcpu.rst | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst
index 1da576a4b767..da7c2ef7dafc 100644
--- a/Documentation/virt/kvm/devices/vcpu.rst
+++ b/Documentation/virt/kvm/devices/vcpu.rst
@@ -26,8 +26,9 @@ Returns:
 	 =======  ========================================================
 	 -EBUSY   The PMU overflow interrupt is already set
 	 -EFAULT  Error reading interrupt number
-	 -ENXIO   The overflow interrupt not set when attempting to get it
-	 -ENODEV  PMUv3 not supported
+	 -ENXIO   PMUv3 not supported or the overflow interrupt not set
+		  when attempting to get it
+	 -ENODEV  KVM_ARM_VCPU_PMU_V3 feature missing from VCPU
 	 -EINVAL  Invalid PMU overflow interrupt number supplied or
 		  trying to set the IRQ number without using an in-kernel
 		  irqchip.
@@ -48,8 +49,8 @@ Returns:
 	 =======  ======================================================
 	 -EEXIST  Interrupt number already used
 	 -ENODEV  PMUv3 not supported or GIC not initialized
-	 -ENXIO   PMUv3 not properly configured or in-kernel irqchip not
-		  configured as required prior to calling this attribute
+	 -ENXIO   PMUv3 not supported, missing VCPU feature or interrupt
+		  number not set
 	 -EBUSY   PMUv3 already initialized
 	 =======  ======================================================
 

From 39533e12063be7f55e3d6ae21ffe067799d542a4 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 16 Jul 2020 17:11:10 +0100
Subject: [PATCH 55/87] arm64: Run ARCH_WORKAROUND_2 enabling code on all CPUs

Commit 606f8e7b27bf ("arm64: capabilities: Use linear array for
detection and verification") changed the way we deal with per-CPU errata
by only calling the .matches() callback until one CPU is found to be
affected. At this point, .matches() stop being called, and .cpu_enable()
will be called on all CPUs.

This breaks the ARCH_WORKAROUND_2 handling, as only a single CPU will be
mitigated.

In order to address this, forcefully call the .matches() callback from a
.cpu_enable() callback, which brings us back to the original behaviour.

Fixes: 606f8e7b27bf ("arm64: capabilities: Use linear array for detection and verification")
Cc: <stable@vger.kernel.org>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/cpu_errata.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 3fe64bf5a58d..abfef5f3b5fd 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -457,6 +457,12 @@ out_printmsg:
 	return required;
 }
 
+static void cpu_enable_ssbd_mitigation(const struct arm64_cpu_capabilities *cap)
+{
+	if (ssbd_state != ARM64_SSBD_FORCE_DISABLE)
+		cap->matches(cap, SCOPE_LOCAL_CPU);
+}
+
 /* known invulnerable cores */
 static const struct midr_range arm64_ssb_cpus[] = {
 	MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
@@ -914,6 +920,7 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 		.capability = ARM64_SSBD,
 		.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
 		.matches = has_ssbd_mitigation,
+		.cpu_enable = cpu_enable_ssbd_mitigation,
 		.midr_range_list = arm64_ssb_cpus,
 	},
 #ifdef CONFIG_ARM64_ERRATUM_1418040

From 6e5f0927846adf39aebee450f13871e3cb4ab012 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Tue, 15 Sep 2020 22:11:13 +0100
Subject: [PATCH 56/87] arm64: Remove Spectre-related CONFIG_* options

The spectre mitigations are too configurable for their own good, leading
to confusing logic trying to figure out when we should mitigate and when
we shouldn't. Although the plethora of command-line options need to stick
around for backwards compatibility, the default-on CONFIG options that
depend on EXPERT can be dropped, as the mitigations only do anything if
the system is vulnerable, a mitigation is available and the command-line
hasn't disabled it.

Remove CONFIG_HARDEN_BRANCH_PREDICTOR and CONFIG_ARM64_SSBD in favour of
enabling this code unconditionally.

Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/Kconfig                      | 26 -------------------------
 arch/arm64/include/asm/cpufeature.h     |  4 ----
 arch/arm64/include/asm/kvm_mmu.h        |  7 -------
 arch/arm64/include/asm/mmu.h            |  9 ---------
 arch/arm64/kernel/Makefile              |  3 +--
 arch/arm64/kernel/cpu_errata.c          | 19 ++----------------
 arch/arm64/kernel/cpufeature.c          |  4 ----
 arch/arm64/kernel/entry.S               |  4 ----
 arch/arm64/kvm/Kconfig                  |  2 +-
 arch/arm64/kvm/hyp/hyp-entry.S          |  2 --
 arch/arm64/kvm/hyp/include/hyp/switch.h |  4 ----
 11 files changed, 4 insertions(+), 80 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 6d232837cbee..51259274a819 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1165,32 +1165,6 @@ config UNMAP_KERNEL_AT_EL0
 
 	  If unsure, say Y.
 
-config HARDEN_BRANCH_PREDICTOR
-	bool "Harden the branch predictor against aliasing attacks" if EXPERT
-	default y
-	help
-	  Speculation attacks against some high-performance processors rely on
-	  being able to manipulate the branch predictor for a victim context by
-	  executing aliasing branches in the attacker context.  Such attacks
-	  can be partially mitigated against by clearing internal branch
-	  predictor state and limiting the prediction logic in some situations.
-
-	  This config option will take CPU-specific actions to harden the
-	  branch predictor against aliasing attacks and may rely on specific
-	  instruction sequences or control bits being set by the system
-	  firmware.
-
-	  If unsure, say Y.
-
-config ARM64_SSBD
-	bool "Speculative Store Bypass Disable" if EXPERT
-	default y
-	help
-	  This enables mitigation of the bypassing of previous stores
-	  by speculative loads.
-
-	  If unsure, say Y.
-
 config RODATA_FULL_DEFAULT_ENABLED
 	bool "Apply r/o permissions of VM areas also to their linear aliases"
 	default y
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 89b4f0142c28..851d144527ed 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -712,12 +712,8 @@ int get_spectre_v2_workaround_state(void);
 
 static inline int arm64_get_ssbd_state(void)
 {
-#ifdef CONFIG_ARM64_SSBD
 	extern int ssbd_state;
 	return ssbd_state;
-#else
-	return ARM64_SSBD_UNKNOWN;
-#endif
 }
 
 void arm64_set_ssbd_mitigation(bool state);
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 189839c3706a..1df85a3ddb9b 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -527,7 +527,6 @@ static inline int kvm_map_vectors(void)
 }
 #endif
 
-#ifdef CONFIG_ARM64_SSBD
 DECLARE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);
 
 static inline int hyp_map_aux_data(void)
@@ -544,12 +543,6 @@ static inline int hyp_map_aux_data(void)
 	}
 	return 0;
 }
-#else
-static inline int hyp_map_aux_data(void)
-{
-	return 0;
-}
-#endif
 
 #define kvm_phys_to_vttbr(addr)		phys_to_ttbr(addr)
 
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index a7a5ecaa2e83..f5e3efeb5b97 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -45,7 +45,6 @@ struct bp_hardening_data {
 	bp_hardening_cb_t	fn;
 };
 
-#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
 DECLARE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
 
 static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void)
@@ -64,14 +63,6 @@ static inline void arm64_apply_bp_hardening(void)
 	if (d->fn)
 		d->fn();
 }
-#else
-static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void)
-{
-	return NULL;
-}
-
-static inline void arm64_apply_bp_hardening(void)	{ }
-#endif	/* CONFIG_HARDEN_BRANCH_PREDICTOR */
 
 extern void arm64_memblock_init(void);
 extern void paging_init(void);
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index a561cbb91d4d..ed8799bdd41f 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -19,7 +19,7 @@ obj-y			:= debug-monitors.o entry.o irq.o fpsimd.o		\
 			   return_address.o cpuinfo.o cpu_errata.o		\
 			   cpufeature.o alternative.o cacheinfo.o		\
 			   smp.o smp_spin_table.o topology.o smccc-call.o	\
-			   syscall.o
+			   ssbd.o syscall.o
 
 targets			+= efi-entry.o
 
@@ -59,7 +59,6 @@ arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
 obj-$(CONFIG_CRASH_DUMP)		+= crash_dump.o
 obj-$(CONFIG_CRASH_CORE)		+= crash_core.o
 obj-$(CONFIG_ARM_SDE_INTERFACE)		+= sdei.o
-obj-$(CONFIG_ARM64_SSBD)		+= ssbd.o
 obj-$(CONFIG_ARM64_PTR_AUTH)		+= pointer_auth.o
 obj-$(CONFIG_SHADOW_CALL_STACK)		+= scs.o
 
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index abfef5f3b5fd..dd9103915f1e 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -254,9 +254,7 @@ static int detect_harden_bp_fw(void)
 	    ((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR_V1))
 		cb = qcom_link_stack_sanitization;
 
-	if (IS_ENABLED(CONFIG_HARDEN_BRANCH_PREDICTOR))
-		install_bp_hardening_cb(cb, smccc_start, smccc_end);
-
+	install_bp_hardening_cb(cb, smccc_start, smccc_end);
 	return 1;
 }
 
@@ -335,11 +333,6 @@ void arm64_set_ssbd_mitigation(bool state)
 {
 	int conduit;
 
-	if (!IS_ENABLED(CONFIG_ARM64_SSBD)) {
-		pr_info_once("SSBD disabled by kernel configuration\n");
-		return;
-	}
-
 	if (this_cpu_has_cap(ARM64_SSBS)) {
 		if (state)
 			asm volatile(SET_PSTATE_SSBS(0));
@@ -584,12 +577,6 @@ check_branch_predictor(const struct arm64_cpu_capabilities *entry, int scope)
 
 	__spectrev2_safe = false;
 
-	if (!IS_ENABLED(CONFIG_HARDEN_BRANCH_PREDICTOR)) {
-		pr_warn_once("spectrev2 mitigation disabled by kernel configuration\n");
-		__hardenbp_enab = false;
-		return false;
-	}
-
 	/* forced off */
 	if (__nospectre_v2 || cpu_mitigations_off()) {
 		pr_info_once("spectrev2 mitigation disabled by command line option\n");
@@ -1004,9 +991,7 @@ ssize_t cpu_show_spec_store_bypass(struct device *dev,
 	switch (ssbd_state) {
 	case ARM64_SSBD_KERNEL:
 	case ARM64_SSBD_FORCE_ENABLE:
-		if (IS_ENABLED(CONFIG_ARM64_SSBD))
-			return sprintf(buf,
-			    "Mitigation: Speculative Store Bypass disabled via prctl\n");
+		return sprintf(buf, "Mitigation: Speculative Store Bypass disabled via prctl\n");
 	}
 
 	return sprintf(buf, "Vulnerable\n");
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 6424584be01e..4bb45b1d4ae4 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1583,7 +1583,6 @@ static void cpu_has_fwb(const struct arm64_cpu_capabilities *__unused)
 	WARN_ON(val & (7 << 27 | 7 << 21));
 }
 
-#ifdef CONFIG_ARM64_SSBD
 static int ssbs_emulation_handler(struct pt_regs *regs, u32 instr)
 {
 	if (user_mode(regs))
@@ -1623,7 +1622,6 @@ static void cpu_enable_ssbs(const struct arm64_cpu_capabilities *__unused)
 		arm64_set_ssbd_mitigation(true);
 	}
 }
-#endif /* CONFIG_ARM64_SSBD */
 
 #ifdef CONFIG_ARM64_PAN
 static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused)
@@ -1976,7 +1974,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.field_pos = ID_AA64ISAR0_CRC32_SHIFT,
 		.min_field_value = 1,
 	},
-#ifdef CONFIG_ARM64_SSBD
 	{
 		.desc = "Speculative Store Bypassing Safe (SSBS)",
 		.capability = ARM64_SSBS,
@@ -1988,7 +1985,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.min_field_value = ID_AA64PFR1_SSBS_PSTATE_ONLY,
 		.cpu_enable = cpu_enable_ssbs,
 	},
-#endif
 #ifdef CONFIG_ARM64_CNP
 	{
 		.desc = "Common not Private translations",
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 55af8b504b65..81b709349d7b 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -132,7 +132,6 @@ alternative_else_nop_endif
 	 * them if required.
 	 */
 	.macro	apply_ssbd, state, tmp1, tmp2
-#ifdef CONFIG_ARM64_SSBD
 alternative_cb	arm64_enable_wa2_handling
 	b	.L__asm_ssbd_skip\@
 alternative_cb_end
@@ -146,7 +145,6 @@ alternative_cb	arm64_update_smccc_conduit
 	nop					// Patched to SMC/HVC #0
 alternative_cb_end
 .L__asm_ssbd_skip\@:
-#endif
 	.endm
 
 	.macro	kernel_entry, el, regsize = 64
@@ -697,11 +695,9 @@ el0_irq_naked:
 	bl	trace_hardirqs_off
 #endif
 
-#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
 	tbz	x22, #55, 1f
 	bl	do_el0_irq_bp_hardening
 1:
-#endif
 	irq_handler
 
 #ifdef CONFIG_TRACE_IRQFLAGS
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 318c8f2df245..42e5895763b3 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -58,7 +58,7 @@ config KVM_ARM_PMU
 	  virtual machines.
 
 config KVM_INDIRECT_VECTORS
-	def_bool HARDEN_BRANCH_PREDICTOR || RANDOMIZE_BASE
+	def_bool RANDOMIZE_BASE
 
 endif # KVM
 
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 46b4dab933d0..41698bae5d5d 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -116,7 +116,6 @@ el1_hvc_guest:
 			  ARM_SMCCC_ARCH_WORKAROUND_2)
 	cbnz	w1, el1_trap
 
-#ifdef CONFIG_ARM64_SSBD
 alternative_cb	arm64_enable_wa2_handling
 	b	wa2_end
 alternative_cb_end
@@ -143,7 +142,6 @@ alternative_cb_end
 wa2_end:
 	mov	x2, xzr
 	mov	x1, xzr
-#endif
 
 wa_epilogue:
 	mov	x0, xzr
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 5b6b8fa00f0a..b503f19c37c5 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -489,7 +489,6 @@ static inline bool __needs_ssbd_off(struct kvm_vcpu *vcpu)
 
 static inline void __set_guest_arch_workaround_state(struct kvm_vcpu *vcpu)
 {
-#ifdef CONFIG_ARM64_SSBD
 	/*
 	 * The host runs with the workaround always present. If the
 	 * guest wants it disabled, so be it...
@@ -497,19 +496,16 @@ static inline void __set_guest_arch_workaround_state(struct kvm_vcpu *vcpu)
 	if (__needs_ssbd_off(vcpu) &&
 	    __hyp_this_cpu_read(arm64_ssbd_callback_required))
 		arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, 0, NULL);
-#endif
 }
 
 static inline void __set_host_arch_workaround_state(struct kvm_vcpu *vcpu)
 {
-#ifdef CONFIG_ARM64_SSBD
 	/*
 	 * If the guest has disabled the workaround, bring it back on.
 	 */
 	if (__needs_ssbd_off(vcpu) &&
 	    __hyp_this_cpu_read(arm64_ssbd_callback_required))
 		arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, 1, NULL);
-#endif
 }
 
 static inline void __kvm_unexpected_el2_exception(void)

From 5359a87d5bdacb0c27f282f40ccbd80ce8f9702a Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Tue, 15 Sep 2020 22:16:10 +0100
Subject: [PATCH 57/87] KVM: arm64: Replace CONFIG_KVM_INDIRECT_VECTORS with
 CONFIG_RANDOMIZE_BASE

The removal of CONFIG_HARDEN_BRANCH_PREDICTOR means that
CONFIG_KVM_INDIRECT_VECTORS is synonymous with CONFIG_RANDOMIZE_BASE,
so replace it.

Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/kvm_asm.h | 2 +-
 arch/arm64/include/asm/kvm_mmu.h | 2 +-
 arch/arm64/kernel/cpu_errata.c   | 4 ++--
 arch/arm64/kvm/Kconfig           | 3 ---
 arch/arm64/kvm/hyp/Makefile      | 2 +-
 arch/arm64/kvm/hyp/hyp-entry.S   | 2 +-
 6 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 6f98fbd0ac81..e9378cc8049d 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -102,7 +102,7 @@ DECLARE_KVM_HYP_SYM(__kvm_hyp_vector);
 #define __kvm_hyp_init		CHOOSE_NVHE_SYM(__kvm_hyp_init)
 #define __kvm_hyp_vector	CHOOSE_HYP_SYM(__kvm_hyp_vector)
 
-#ifdef CONFIG_KVM_INDIRECT_VECTORS
+#ifdef CONFIG_RANDOMIZE_BASE
 extern atomic_t arm64_el2_vector_last_slot;
 DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs);
 #define __bp_harden_hyp_vecs	CHOOSE_HYP_SYM(__bp_harden_hyp_vecs)
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 1df85a3ddb9b..dfd176b0642e 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -430,7 +430,7 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
 	return ret;
 }
 
-#ifdef CONFIG_KVM_INDIRECT_VECTORS
+#ifdef CONFIG_RANDOMIZE_BASE
 /*
  * EL2 vectors can be mapped and rerouted in a number of ways,
  * depending on the kernel configuration and CPU present:
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index dd9103915f1e..135bf7f92d4a 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -113,7 +113,7 @@ atomic_t arm64_el2_vector_last_slot = ATOMIC_INIT(-1);
 
 DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
 
-#ifdef CONFIG_KVM_INDIRECT_VECTORS
+#ifdef CONFIG_RANDOMIZE_BASE
 static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start,
 				const char *hyp_vecs_end)
 {
@@ -167,7 +167,7 @@ static void install_bp_hardening_cb(bp_hardening_cb_t fn,
 {
 	__this_cpu_write(bp_hardening_data.fn, fn);
 }
-#endif	/* CONFIG_KVM_INDIRECT_VECTORS */
+#endif	/* CONFIG_RANDOMIZE_BASE */
 
 #include <linux/arm-smccc.h>
 
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 42e5895763b3..043756db8f6e 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -57,9 +57,6 @@ config KVM_ARM_PMU
 	  Adds support for a virtual Performance Monitoring Unit (PMU) in
 	  virtual machines.
 
-config KVM_INDIRECT_VECTORS
-	def_bool RANDOMIZE_BASE
-
 endif # KVM
 
 endif # VIRTUALIZATION
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index f54f0e89a71c..89d2bf73acb5 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -11,4 +11,4 @@ subdir-ccflags-y := -I$(incdir)				\
 		    $(DISABLE_STACKLEAK_PLUGIN)
 
 obj-$(CONFIG_KVM) += vhe/ nvhe/
-obj-$(CONFIG_KVM_INDIRECT_VECTORS) += smccc_wa.o
+obj-$(CONFIG_RANDOMIZE_BASE) += smccc_wa.o
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 41698bae5d5d..db2dd7500617 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -286,7 +286,7 @@ SYM_CODE_START(__kvm_hyp_vector)
 	valid_vect	el1_error		// Error 32-bit EL1
 SYM_CODE_END(__kvm_hyp_vector)
 
-#ifdef CONFIG_KVM_INDIRECT_VECTORS
+#ifdef CONFIG_RANDOMIZE_BASE
 .macro hyp_ventry
 	.align 7
 1:	esb

From b181048f414668933c524fe077b9aa6e8e9a42b4 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Tue, 15 Sep 2020 22:42:07 +0100
Subject: [PATCH 58/87] KVM: arm64: Simplify install_bp_hardening_cb()

Use is_hyp_mode_available() to detect whether or not we need to patch
the KVM vectors for branch hardening, which avoids the need to take the
vector pointers as parameters.

Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/cpu_errata.c | 27 +++++++--------------------
 1 file changed, 7 insertions(+), 20 deletions(-)

diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 135bf7f92d4a..a72ca57f5630 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -126,18 +126,19 @@ static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start,
 	__flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K);
 }
 
-static void install_bp_hardening_cb(bp_hardening_cb_t fn,
-				    const char *hyp_vecs_start,
-				    const char *hyp_vecs_end)
+static void install_bp_hardening_cb(bp_hardening_cb_t fn)
 {
 	static DEFINE_RAW_SPINLOCK(bp_lock);
 	int cpu, slot = -1;
+	const char *hyp_vecs_start = __smccc_workaround_1_smc;
+	const char *hyp_vecs_end = __smccc_workaround_1_smc +
+				   __SMCCC_WORKAROUND_1_SMC_SZ;
 
 	/*
 	 * detect_harden_bp_fw() passes NULL for the hyp_vecs start/end if
 	 * we're a guest. Skip the hyp-vectors work.
 	 */
-	if (!hyp_vecs_start) {
+	if (!is_hyp_mode_available()) {
 		__this_cpu_write(bp_hardening_data.fn, fn);
 		return;
 	}
@@ -161,9 +162,7 @@ static void install_bp_hardening_cb(bp_hardening_cb_t fn,
 	raw_spin_unlock(&bp_lock);
 }
 #else
-static void install_bp_hardening_cb(bp_hardening_cb_t fn,
-				      const char *hyp_vecs_start,
-				      const char *hyp_vecs_end)
+static void install_bp_hardening_cb(bp_hardening_cb_t fn)
 {
 	__this_cpu_write(bp_hardening_data.fn, fn);
 }
@@ -209,7 +208,6 @@ early_param("nospectre_v2", parse_nospectre_v2);
 static int detect_harden_bp_fw(void)
 {
 	bp_hardening_cb_t cb;
-	void *smccc_start, *smccc_end;
 	struct arm_smccc_res res;
 	u32 midr = read_cpuid_id();
 
@@ -229,21 +227,10 @@ static int detect_harden_bp_fw(void)
 	switch (arm_smccc_1_1_get_conduit()) {
 	case SMCCC_CONDUIT_HVC:
 		cb = call_hvc_arch_workaround_1;
-		/* This is a guest, no need to patch KVM vectors */
-		smccc_start = NULL;
-		smccc_end = NULL;
 		break;
 
 	case SMCCC_CONDUIT_SMC:
 		cb = call_smc_arch_workaround_1;
-#if IS_ENABLED(CONFIG_KVM)
-		smccc_start = __smccc_workaround_1_smc;
-		smccc_end = __smccc_workaround_1_smc +
-			__SMCCC_WORKAROUND_1_SMC_SZ;
-#else
-		smccc_start = NULL;
-		smccc_end = NULL;
-#endif
 		break;
 
 	default:
@@ -254,7 +241,7 @@ static int detect_harden_bp_fw(void)
 	    ((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR_V1))
 		cb = qcom_link_stack_sanitization;
 
-	install_bp_hardening_cb(cb, smccc_start, smccc_end);
+	install_bp_hardening_cb(cb);
 	return 1;
 }
 

From 688f1e4b6d8f792c44bec3a448664fd9e8949964 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Tue, 15 Sep 2020 23:00:31 +0100
Subject: [PATCH 59/87] arm64: Rename ARM64_HARDEN_BRANCH_PREDICTOR to
 ARM64_SPECTRE_V2

For better or worse, the world knows about "Spectre" and not about
"Branch predictor hardening". Rename ARM64_HARDEN_BRANCH_PREDICTOR to
ARM64_SPECTRE_V2 as part of moving all of the Spectre mitigations into
their own little corner.

Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/cpucaps.h |  2 +-
 arch/arm64/include/asm/kvm_mmu.h | 27 +++++++++++++--------------
 arch/arm64/include/asm/mmu.h     |  2 +-
 arch/arm64/kernel/cpu_errata.c   |  2 +-
 4 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 07b643a70710..348bfcf6c818 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -31,7 +31,7 @@
 #define ARM64_HAS_DCPOP				21
 #define ARM64_SVE				22
 #define ARM64_UNMAP_KERNEL_AT_EL0		23
-#define ARM64_HARDEN_BRANCH_PREDICTOR		24
+#define ARM64_SPECTRE_V2			24
 #define ARM64_HAS_RAS_EXTN			25
 #define ARM64_WORKAROUND_843419			26
 #define ARM64_HAS_CACHE_IDC			27
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index dfd176b0642e..873e12430ac7 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -435,14 +435,13 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
  * EL2 vectors can be mapped and rerouted in a number of ways,
  * depending on the kernel configuration and CPU present:
  *
- * - If the CPU has the ARM64_HARDEN_BRANCH_PREDICTOR cap, the
- *   hardening sequence is placed in one of the vector slots, which is
- *   executed before jumping to the real vectors.
+ * - If the CPU is affected by Spectre-v2, the hardening sequence is
+ *   placed in one of the vector slots, which is executed before jumping
+ *   to the real vectors.
  *
- * - If the CPU has both the ARM64_HARDEN_EL2_VECTORS cap and the
- *   ARM64_HARDEN_BRANCH_PREDICTOR cap, the slot containing the
- *   hardening sequence is mapped next to the idmap page, and executed
- *   before jumping to the real vectors.
+ * - If the CPU also has the ARM64_HARDEN_EL2_VECTORS cap, the slot
+ *   containing the hardening sequence is mapped next to the idmap page,
+ *   and executed before jumping to the real vectors.
  *
  * - If the CPU only has the ARM64_HARDEN_EL2_VECTORS cap, then an
  *   empty slot is selected, mapped next to the idmap page, and
@@ -464,7 +463,7 @@ static inline void *kvm_get_hyp_vector(void)
 	void *vect = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
 	int slot = -1;
 
-	if (cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR) && data->fn) {
+	if (cpus_have_const_cap(ARM64_SPECTRE_V2) && data->fn) {
 		vect = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs));
 		slot = data->hyp_vectors_slot;
 	}
@@ -485,15 +484,15 @@ static inline void *kvm_get_hyp_vector(void)
 static inline int kvm_map_vectors(void)
 {
 	/*
-	 * HBP  = ARM64_HARDEN_BRANCH_PREDICTOR
+	 * SV2  = ARM64_SPECTRE_V2
 	 * HEL2 = ARM64_HARDEN_EL2_VECTORS
 	 *
-	 * !HBP + !HEL2 -> use direct vectors
-	 *  HBP + !HEL2 -> use hardened vectors in place
-	 * !HBP +  HEL2 -> allocate one vector slot and use exec mapping
-	 *  HBP +  HEL2 -> use hardened vertors and use exec mapping
+	 * !SV2 + !HEL2 -> use direct vectors
+	 *  SV2 + !HEL2 -> use hardened vectors in place
+	 * !SV2 +  HEL2 -> allocate one vector slot and use exec mapping
+	 *  SV2 +  HEL2 -> use hardened vertors and use exec mapping
 	 */
-	if (cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR)) {
+	if (cpus_have_const_cap(ARM64_SPECTRE_V2)) {
 		__kvm_bp_vect_base = kvm_ksym_ref(__bp_harden_hyp_vecs);
 		__kvm_bp_vect_base = kern_hyp_va(__kvm_bp_vect_base);
 	}
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index f5e3efeb5b97..cbff2d42c1d8 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -56,7 +56,7 @@ static inline void arm64_apply_bp_hardening(void)
 {
 	struct bp_hardening_data *d;
 
-	if (!cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR))
+	if (!cpus_have_const_cap(ARM64_SPECTRE_V2))
 		return;
 
 	d = arm64_get_bp_hardening_data();
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index a72ca57f5630..b275f2d5e7a3 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -877,7 +877,7 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 #endif
 	{
 		.desc = "Branch predictor hardening",
-		.capability = ARM64_HARDEN_BRANCH_PREDICTOR,
+		.capability = ARM64_SPECTRE_V2,
 		.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
 		.matches = check_branch_predictor,
 		.cpu_enable = cpu_enable_branch_predictor_hardening,

From 455697adefdb8604cd10413da37c60014aecbbb7 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Tue, 15 Sep 2020 23:10:49 +0100
Subject: [PATCH 60/87] arm64: Introduce separate file for spectre mitigations
 and reporting

The spectre mitigation code is spread over a few different files, which
makes it both hard to follow, but also hard to remove it should we want
to do that in future.

Introduce a new file for housing the spectre mitigations, and populate
it with the spectre-v1 reporting code to start with.

Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/Makefile      |  2 +-
 arch/arm64/kernel/cpu_errata.c  |  6 ------
 arch/arm64/kernel/proton-pack.c | 32 ++++++++++++++++++++++++++++++++
 3 files changed, 33 insertions(+), 7 deletions(-)
 create mode 100644 arch/arm64/kernel/proton-pack.c

diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index ed8799bdd41f..15b0fcbb6883 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -19,7 +19,7 @@ obj-y			:= debug-monitors.o entry.o irq.o fpsimd.o		\
 			   return_address.o cpuinfo.o cpu_errata.o		\
 			   cpufeature.o alternative.o cacheinfo.o		\
 			   smp.o smp_spin_table.o topology.o smccc-call.o	\
-			   ssbd.o syscall.o
+			   ssbd.o syscall.o proton-pack.o
 
 targets			+= efi-entry.o
 
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index b275f2d5e7a3..5eb9a9126dc4 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -949,12 +949,6 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 	}
 };
 
-ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr,
-			    char *buf)
-{
-	return sprintf(buf, "Mitigation: __user pointer sanitization\n");
-}
-
 ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr,
 		char *buf)
 {
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
new file mode 100644
index 000000000000..c37bf468e4a4
--- /dev/null
+++ b/arch/arm64/kernel/proton-pack.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Handle detection, reporting and mitigation of Spectre v1, v2 and v4, as
+ * detailed at:
+ *
+ *   https://developer.arm.com/support/arm-security-updates/speculative-processor-vulnerability
+ *
+ * This code was originally written hastily under an awful lot of stress and so
+ * aspects of it are somewhat hacky. Unfortunately, changing anything in here
+ * instantly makes me feel ill. Thanks, Jann. Thann.
+ *
+ * Copyright (C) 2018 ARM Ltd, All Rights Reserved.
+ * Copyright (C) 2020 Google LLC
+ *
+ * "If there's something strange in your neighbourhood, who you gonna call?"
+ *
+ * Authors: Will Deacon <will@kernel.org> and Marc Zyngier <maz@kernel.org>
+ */
+
+#include <linux/device.h>
+
+/*
+ * Spectre v1.
+ *
+ * The kernel can't protect userspace for this one: it's each person for
+ * themselves. Advertise what we're doing and be done with it.
+ */
+ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	return sprintf(buf, "Mitigation: __user pointer sanitization\n");
+}

From d4647f0a2ad711101067cba69c34716758aa1e48 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Tue, 15 Sep 2020 23:30:17 +0100
Subject: [PATCH 61/87] arm64: Rewrite Spectre-v2 mitigation code

The Spectre-v2 mitigation code is pretty unwieldy and hard to maintain.
This is largely due to it being written hastily, without much clue as to
how things would pan out, and also because it ends up mixing policy and
state in such a way that it is very difficult to figure out what's going
on.

Rewrite the Spectre-v2 mitigation so that it clearly separates state from
policy and follows a more structured approach to handling the mitigation.

Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/cpufeature.h |   6 -
 arch/arm64/include/asm/kvm_host.h   |  17 --
 arch/arm64/include/asm/processor.h  |   1 +
 arch/arm64/include/asm/spectre.h    |  27 +++
 arch/arm64/kernel/cpu_errata.c      | 236 +----------------------
 arch/arm64/kernel/proton-pack.c     | 288 ++++++++++++++++++++++++++++
 arch/arm64/kvm/hypercalls.c         |   8 +-
 arch/arm64/kvm/psci.c               |   8 +-
 8 files changed, 327 insertions(+), 264 deletions(-)
 create mode 100644 arch/arm64/include/asm/spectre.h

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 851d144527ed..3b48aa121cee 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -698,12 +698,6 @@ static inline bool system_supports_tlb_range(void)
 		cpus_have_const_cap(ARM64_HAS_TLB_RANGE);
 }
 
-#define ARM64_BP_HARDEN_UNKNOWN		-1
-#define ARM64_BP_HARDEN_WA_NEEDED	0
-#define ARM64_BP_HARDEN_NOT_REQUIRED	1
-
-int get_spectre_v2_workaround_state(void);
-
 #define ARM64_SSBD_UNKNOWN		-1
 #define ARM64_SSBD_FORCE_DISABLE	0
 #define ARM64_SSBD_KERNEL		1
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index e52c927aade5..59f6cff0f0cb 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -631,23 +631,6 @@ static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {}
 static inline void kvm_clr_pmu_events(u32 clr) {}
 #endif
 
-#define KVM_BP_HARDEN_UNKNOWN		-1
-#define KVM_BP_HARDEN_WA_NEEDED		0
-#define KVM_BP_HARDEN_NOT_REQUIRED	1
-
-static inline int kvm_arm_harden_branch_predictor(void)
-{
-	switch (get_spectre_v2_workaround_state()) {
-	case ARM64_BP_HARDEN_WA_NEEDED:
-		return KVM_BP_HARDEN_WA_NEEDED;
-	case ARM64_BP_HARDEN_NOT_REQUIRED:
-		return KVM_BP_HARDEN_NOT_REQUIRED;
-	case ARM64_BP_HARDEN_UNKNOWN:
-	default:
-		return KVM_BP_HARDEN_UNKNOWN;
-	}
-}
-
 #define KVM_SSBD_UNKNOWN		-1
 #define KVM_SSBD_FORCE_DISABLE		0
 #define KVM_SSBD_KERNEL		1
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 240fe5e5b720..436ab1549ec6 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -38,6 +38,7 @@
 #include <asm/pgtable-hwdef.h>
 #include <asm/pointer_auth.h>
 #include <asm/ptrace.h>
+#include <asm/spectre.h>
 #include <asm/types.h>
 
 /*
diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h
new file mode 100644
index 000000000000..b776abe28dff
--- /dev/null
+++ b/arch/arm64/include/asm/spectre.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Interface for managing mitigations for Spectre vulnerabilities.
+ *
+ * Copyright (C) 2020 Google LLC
+ * Author: Will Deacon <will@kernel.org>
+ */
+
+#ifndef __ASM_SPECTRE_H
+#define __ASM_SPECTRE_H
+
+#include <asm/cpufeature.h>
+
+/* Watch out, ordering is important here. */
+enum mitigation_state {
+	SPECTRE_UNAFFECTED,
+	SPECTRE_MITIGATED,
+	SPECTRE_VULNERABLE,
+};
+
+struct task_struct;
+
+enum mitigation_state arm64_get_spectre_v2_state(void);
+bool has_spectre_v2(const struct arm64_cpu_capabilities *cap, int scope);
+void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused);
+
+#endif	/* __ASM_SPECTRE_H */
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 5eb9a9126dc4..4103391f51b5 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -106,145 +106,6 @@ cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *cap)
 		sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0);
 }
 
-atomic_t arm64_el2_vector_last_slot = ATOMIC_INIT(-1);
-
-#include <asm/mmu_context.h>
-#include <asm/cacheflush.h>
-
-DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
-
-#ifdef CONFIG_RANDOMIZE_BASE
-static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start,
-				const char *hyp_vecs_end)
-{
-	void *dst = lm_alias(__bp_harden_hyp_vecs + slot * SZ_2K);
-	int i;
-
-	for (i = 0; i < SZ_2K; i += 0x80)
-		memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start);
-
-	__flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K);
-}
-
-static void install_bp_hardening_cb(bp_hardening_cb_t fn)
-{
-	static DEFINE_RAW_SPINLOCK(bp_lock);
-	int cpu, slot = -1;
-	const char *hyp_vecs_start = __smccc_workaround_1_smc;
-	const char *hyp_vecs_end = __smccc_workaround_1_smc +
-				   __SMCCC_WORKAROUND_1_SMC_SZ;
-
-	/*
-	 * detect_harden_bp_fw() passes NULL for the hyp_vecs start/end if
-	 * we're a guest. Skip the hyp-vectors work.
-	 */
-	if (!is_hyp_mode_available()) {
-		__this_cpu_write(bp_hardening_data.fn, fn);
-		return;
-	}
-
-	raw_spin_lock(&bp_lock);
-	for_each_possible_cpu(cpu) {
-		if (per_cpu(bp_hardening_data.fn, cpu) == fn) {
-			slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu);
-			break;
-		}
-	}
-
-	if (slot == -1) {
-		slot = atomic_inc_return(&arm64_el2_vector_last_slot);
-		BUG_ON(slot >= BP_HARDEN_EL2_SLOTS);
-		__copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end);
-	}
-
-	__this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot);
-	__this_cpu_write(bp_hardening_data.fn, fn);
-	raw_spin_unlock(&bp_lock);
-}
-#else
-static void install_bp_hardening_cb(bp_hardening_cb_t fn)
-{
-	__this_cpu_write(bp_hardening_data.fn, fn);
-}
-#endif	/* CONFIG_RANDOMIZE_BASE */
-
-#include <linux/arm-smccc.h>
-
-static void __maybe_unused call_smc_arch_workaround_1(void)
-{
-	arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
-}
-
-static void call_hvc_arch_workaround_1(void)
-{
-	arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
-}
-
-static void qcom_link_stack_sanitization(void)
-{
-	u64 tmp;
-
-	asm volatile("mov	%0, x30		\n"
-		     ".rept	16		\n"
-		     "bl	. + 4		\n"
-		     ".endr			\n"
-		     "mov	x30, %0		\n"
-		     : "=&r" (tmp));
-}
-
-static bool __nospectre_v2;
-static int __init parse_nospectre_v2(char *str)
-{
-	__nospectre_v2 = true;
-	return 0;
-}
-early_param("nospectre_v2", parse_nospectre_v2);
-
-/*
- * -1: No workaround
- *  0: No workaround required
- *  1: Workaround installed
- */
-static int detect_harden_bp_fw(void)
-{
-	bp_hardening_cb_t cb;
-	struct arm_smccc_res res;
-	u32 midr = read_cpuid_id();
-
-	arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
-			     ARM_SMCCC_ARCH_WORKAROUND_1, &res);
-
-	switch ((int)res.a0) {
-	case 1:
-		/* Firmware says we're just fine */
-		return 0;
-	case 0:
-		break;
-	default:
-		return -1;
-	}
-
-	switch (arm_smccc_1_1_get_conduit()) {
-	case SMCCC_CONDUIT_HVC:
-		cb = call_hvc_arch_workaround_1;
-		break;
-
-	case SMCCC_CONDUIT_SMC:
-		cb = call_smc_arch_workaround_1;
-		break;
-
-	default:
-		return -1;
-	}
-
-	if (((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR) ||
-	    ((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR_V1))
-		cb = qcom_link_stack_sanitization;
-
-	install_bp_hardening_cb(cb);
-	return 1;
-}
-
 DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);
 
 int ssbd_state __read_mostly = ARM64_SSBD_KERNEL;
@@ -508,83 +369,6 @@ cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused)
 	.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,			\
 	CAP_MIDR_RANGE_LIST(midr_list)
 
-/* Track overall mitigation state. We are only mitigated if all cores are ok */
-static bool __hardenbp_enab = true;
-static bool __spectrev2_safe = true;
-
-int get_spectre_v2_workaround_state(void)
-{
-	if (__spectrev2_safe)
-		return ARM64_BP_HARDEN_NOT_REQUIRED;
-
-	if (!__hardenbp_enab)
-		return ARM64_BP_HARDEN_UNKNOWN;
-
-	return ARM64_BP_HARDEN_WA_NEEDED;
-}
-
-/*
- * List of CPUs that do not need any Spectre-v2 mitigation at all.
- */
-static const struct midr_range spectre_v2_safe_list[] = {
-	MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
-	MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
-	MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
-	MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
-	MIDR_ALL_VERSIONS(MIDR_HISI_TSV110),
-	MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
-	MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
-	{ /* sentinel */ }
-};
-
-/*
- * Track overall bp hardening for all heterogeneous cores in the machine.
- * We are only considered "safe" if all booted cores are known safe.
- */
-static bool __maybe_unused
-check_branch_predictor(const struct arm64_cpu_capabilities *entry, int scope)
-{
-	int need_wa;
-
-	WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
-
-	/* If the CPU has CSV2 set, we're safe */
-	if (cpuid_feature_extract_unsigned_field(read_cpuid(ID_AA64PFR0_EL1),
-						 ID_AA64PFR0_CSV2_SHIFT))
-		return false;
-
-	/* Alternatively, we have a list of unaffected CPUs */
-	if (is_midr_in_range_list(read_cpuid_id(), spectre_v2_safe_list))
-		return false;
-
-	/* Fallback to firmware detection */
-	need_wa = detect_harden_bp_fw();
-	if (!need_wa)
-		return false;
-
-	__spectrev2_safe = false;
-
-	/* forced off */
-	if (__nospectre_v2 || cpu_mitigations_off()) {
-		pr_info_once("spectrev2 mitigation disabled by command line option\n");
-		__hardenbp_enab = false;
-		return false;
-	}
-
-	if (need_wa < 0) {
-		pr_warn_once("ARM_SMCCC_ARCH_WORKAROUND_1 missing from firmware\n");
-		__hardenbp_enab = false;
-	}
-
-	return (need_wa > 0);
-}
-
-static void
-cpu_enable_branch_predictor_hardening(const struct arm64_cpu_capabilities *cap)
-{
-	cap->matches(cap, SCOPE_LOCAL_CPU);
-}
-
 static const __maybe_unused struct midr_range tx2_family_cpus[] = {
 	MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN),
 	MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2),
@@ -876,11 +660,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 	},
 #endif
 	{
-		.desc = "Branch predictor hardening",
+		.desc = "Spectre-v2",
 		.capability = ARM64_SPECTRE_V2,
 		.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
-		.matches = check_branch_predictor,
-		.cpu_enable = cpu_enable_branch_predictor_hardening,
+		.matches = has_spectre_v2,
+		.cpu_enable = spectre_v2_enable_mitigation,
 	},
 #ifdef CONFIG_RANDOMIZE_BASE
 	{
@@ -949,20 +733,6 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 	}
 };
 
-ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr,
-		char *buf)
-{
-	switch (get_spectre_v2_workaround_state()) {
-	case ARM64_BP_HARDEN_NOT_REQUIRED:
-		return sprintf(buf, "Not affected\n");
-        case ARM64_BP_HARDEN_WA_NEEDED:
-		return sprintf(buf, "Mitigation: Branch predictor hardening\n");
-        case ARM64_BP_HARDEN_UNKNOWN:
-	default:
-		return sprintf(buf, "Vulnerable\n");
-	}
-}
-
 ssize_t cpu_show_spec_store_bypass(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
index c37bf468e4a4..4f1411b45301 100644
--- a/arch/arm64/kernel/proton-pack.c
+++ b/arch/arm64/kernel/proton-pack.c
@@ -17,7 +17,33 @@
  * Authors: Will Deacon <will@kernel.org> and Marc Zyngier <maz@kernel.org>
  */
 
+#include <linux/arm-smccc.h>
+#include <linux/cpu.h>
 #include <linux/device.h>
+#include <linux/prctl.h>
+
+#include <asm/spectre.h>
+#include <asm/traps.h>
+
+/*
+ * We try to ensure that the mitigation state can never change as the result of
+ * onlining a late CPU.
+ */
+static void update_mitigation_state(enum mitigation_state *oldp,
+				    enum mitigation_state new)
+{
+	enum mitigation_state state;
+
+	do {
+		state = READ_ONCE(*oldp);
+		if (new <= state)
+			break;
+
+		/* Userspace almost certainly can't deal with this. */
+		if (WARN_ON(system_capabilities_finalized()))
+			break;
+	} while (cmpxchg_relaxed(oldp, state, new) != state);
+}
 
 /*
  * Spectre v1.
@@ -30,3 +56,265 @@ ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr,
 {
 	return sprintf(buf, "Mitigation: __user pointer sanitization\n");
 }
+
+/*
+ * Spectre v2.
+ *
+ * This one sucks. A CPU is either:
+ *
+ * - Mitigated in hardware and advertised by ID_AA64PFR0_EL1.CSV2.
+ * - Mitigated in hardware and listed in our "safe list".
+ * - Mitigated in software by firmware.
+ * - Mitigated in software by a CPU-specific dance in the kernel.
+ * - Vulnerable.
+ *
+ * It's not unlikely for different CPUs in a big.LITTLE system to fall into
+ * different camps.
+ */
+static enum mitigation_state spectre_v2_state;
+
+static bool __read_mostly __nospectre_v2;
+static int __init parse_spectre_v2_param(char *str)
+{
+	__nospectre_v2 = true;
+	return 0;
+}
+early_param("nospectre_v2", parse_spectre_v2_param);
+
+static bool spectre_v2_mitigations_off(void)
+{
+	bool ret = __nospectre_v2 || cpu_mitigations_off();
+
+	if (ret)
+		pr_info_once("spectre-v2 mitigation disabled by command line option\n");
+
+	return ret;
+}
+
+ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	switch (spectre_v2_state) {
+	case SPECTRE_UNAFFECTED:
+		return sprintf(buf, "Not affected\n");
+	case SPECTRE_MITIGATED:
+		return sprintf(buf, "Mitigation: Branch predictor hardening\n");
+	case SPECTRE_VULNERABLE:
+		fallthrough;
+	default:
+		return sprintf(buf, "Vulnerable\n");
+	}
+}
+
+static enum mitigation_state spectre_v2_get_cpu_hw_mitigation_state(void)
+{
+	u64 pfr0;
+	static const struct midr_range spectre_v2_safe_list[] = {
+		MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
+		MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
+		MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
+		MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
+		MIDR_ALL_VERSIONS(MIDR_HISI_TSV110),
+		MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
+		MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
+		{ /* sentinel */ }
+	};
+
+	/* If the CPU has CSV2 set, we're safe */
+	pfr0 = read_cpuid(ID_AA64PFR0_EL1);
+	if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_CSV2_SHIFT))
+		return SPECTRE_UNAFFECTED;
+
+	/* Alternatively, we have a list of unaffected CPUs */
+	if (is_midr_in_range_list(read_cpuid_id(), spectre_v2_safe_list))
+		return SPECTRE_UNAFFECTED;
+
+	return SPECTRE_VULNERABLE;
+}
+
+#define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED	(1)
+
+static enum mitigation_state spectre_v2_get_cpu_fw_mitigation_state(void)
+{
+	int ret;
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
+			     ARM_SMCCC_ARCH_WORKAROUND_1, &res);
+
+	ret = res.a0;
+	switch (ret) {
+	case SMCCC_RET_SUCCESS:
+		return SPECTRE_MITIGATED;
+	case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED:
+		return SPECTRE_UNAFFECTED;
+	default:
+		fallthrough;
+	case SMCCC_RET_NOT_SUPPORTED:
+		return SPECTRE_VULNERABLE;
+	}
+}
+
+bool has_spectre_v2(const struct arm64_cpu_capabilities *entry, int scope)
+{
+	WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
+
+	if (spectre_v2_get_cpu_hw_mitigation_state() == SPECTRE_UNAFFECTED)
+		return false;
+
+	if (spectre_v2_get_cpu_fw_mitigation_state() == SPECTRE_UNAFFECTED)
+		return false;
+
+	return true;
+}
+
+DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
+
+enum mitigation_state arm64_get_spectre_v2_state(void)
+{
+	return spectre_v2_state;
+}
+
+#ifdef CONFIG_KVM
+#ifdef CONFIG_RANDOMIZE_BASE
+#include <asm/cacheflush.h>
+#include <asm/kvm_asm.h>
+
+atomic_t arm64_el2_vector_last_slot = ATOMIC_INIT(-1);
+
+static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start,
+				const char *hyp_vecs_end)
+{
+	void *dst = lm_alias(__bp_harden_hyp_vecs + slot * SZ_2K);
+	int i;
+
+	for (i = 0; i < SZ_2K; i += 0x80)
+		memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start);
+
+	__flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K);
+}
+
+static void install_bp_hardening_cb(bp_hardening_cb_t fn)
+{
+	static DEFINE_RAW_SPINLOCK(bp_lock);
+	int cpu, slot = -1;
+	const char *hyp_vecs_start = __smccc_workaround_1_smc;
+	const char *hyp_vecs_end = __smccc_workaround_1_smc +
+				   __SMCCC_WORKAROUND_1_SMC_SZ;
+
+	/*
+	 * detect_harden_bp_fw() passes NULL for the hyp_vecs start/end if
+	 * we're a guest. Skip the hyp-vectors work.
+	 */
+	if (!is_hyp_mode_available()) {
+		__this_cpu_write(bp_hardening_data.fn, fn);
+		return;
+	}
+
+	raw_spin_lock(&bp_lock);
+	for_each_possible_cpu(cpu) {
+		if (per_cpu(bp_hardening_data.fn, cpu) == fn) {
+			slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu);
+			break;
+		}
+	}
+
+	if (slot == -1) {
+		slot = atomic_inc_return(&arm64_el2_vector_last_slot);
+		BUG_ON(slot >= BP_HARDEN_EL2_SLOTS);
+		__copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end);
+	}
+
+	__this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot);
+	__this_cpu_write(bp_hardening_data.fn, fn);
+	raw_spin_unlock(&bp_lock);
+}
+#else
+static void install_bp_hardening_cb(bp_hardening_cb_t fn)
+{
+	__this_cpu_write(bp_hardening_data.fn, fn);
+}
+#endif	/* CONFIG_RANDOMIZE_BASE */
+#endif	/* CONFIG_KVM */
+
+static void call_smc_arch_workaround_1(void)
+{
+	arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
+}
+
+static void call_hvc_arch_workaround_1(void)
+{
+	arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
+}
+
+static void qcom_link_stack_sanitisation(void)
+{
+	u64 tmp;
+
+	asm volatile("mov	%0, x30		\n"
+		     ".rept	16		\n"
+		     "bl	. + 4		\n"
+		     ".endr			\n"
+		     "mov	x30, %0		\n"
+		     : "=&r" (tmp));
+}
+
+static enum mitigation_state spectre_v2_enable_fw_mitigation(void)
+{
+	bp_hardening_cb_t cb;
+	enum mitigation_state state;
+
+	state = spectre_v2_get_cpu_fw_mitigation_state();
+	if (state != SPECTRE_MITIGATED)
+		return state;
+
+	if (spectre_v2_mitigations_off())
+		return SPECTRE_VULNERABLE;
+
+	switch (arm_smccc_1_1_get_conduit()) {
+	case SMCCC_CONDUIT_HVC:
+		cb = call_hvc_arch_workaround_1;
+		break;
+
+	case SMCCC_CONDUIT_SMC:
+		cb = call_smc_arch_workaround_1;
+		break;
+
+	default:
+		return SPECTRE_VULNERABLE;
+	}
+
+	install_bp_hardening_cb(cb);
+	return SPECTRE_MITIGATED;
+}
+
+static enum mitigation_state spectre_v2_enable_sw_mitigation(void)
+{
+	u32 midr;
+
+	if (spectre_v2_mitigations_off())
+		return SPECTRE_VULNERABLE;
+
+	midr = read_cpuid_id();
+	if (((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR) &&
+	    ((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR_V1))
+		return SPECTRE_VULNERABLE;
+
+	install_bp_hardening_cb(qcom_link_stack_sanitisation);
+	return SPECTRE_MITIGATED;
+}
+
+void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
+{
+	enum mitigation_state state;
+
+	WARN_ON(preemptible());
+
+	state = spectre_v2_get_cpu_hw_mitigation_state();
+	if (state == SPECTRE_VULNERABLE)
+		state = spectre_v2_enable_fw_mitigation();
+	if (state == SPECTRE_VULNERABLE)
+		state = spectre_v2_enable_sw_mitigation();
+
+	update_mitigation_state(&spectre_v2_state, state);
+}
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 550dfa3e53cd..413d46b9bc07 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -24,13 +24,13 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
 		feature = smccc_get_arg1(vcpu);
 		switch (feature) {
 		case ARM_SMCCC_ARCH_WORKAROUND_1:
-			switch (kvm_arm_harden_branch_predictor()) {
-			case KVM_BP_HARDEN_UNKNOWN:
+			switch (arm64_get_spectre_v2_state()) {
+			case SPECTRE_VULNERABLE:
 				break;
-			case KVM_BP_HARDEN_WA_NEEDED:
+			case SPECTRE_MITIGATED:
 				val = SMCCC_RET_SUCCESS;
 				break;
-			case KVM_BP_HARDEN_NOT_REQUIRED:
+			case SPECTRE_UNAFFECTED:
 				val = SMCCC_RET_NOT_REQUIRED;
 				break;
 			}
diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c
index 83415e96b589..fbdd6f3bea7f 100644
--- a/arch/arm64/kvm/psci.c
+++ b/arch/arm64/kvm/psci.c
@@ -425,12 +425,12 @@ static int get_kernel_wa_level(u64 regid)
 {
 	switch (regid) {
 	case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
-		switch (kvm_arm_harden_branch_predictor()) {
-		case KVM_BP_HARDEN_UNKNOWN:
+		switch (arm64_get_spectre_v2_state()) {
+		case SPECTRE_VULNERABLE:
 			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
-		case KVM_BP_HARDEN_WA_NEEDED:
+		case SPECTRE_MITIGATED:
 			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL;
-		case KVM_BP_HARDEN_NOT_REQUIRED:
+		case SPECTRE_UNAFFECTED:
 			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED;
 		}
 		return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;

From e1026237f90677fd5a454f63057a62f984c2188d Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Wed, 16 Sep 2020 00:07:05 +0100
Subject: [PATCH 62/87] KVM: arm64: Set CSV2 for guests on hardware unaffected
 by Spectre-v2

If the system is not affected by Spectre-v2, then advertise to the KVM
guest that it is not affected, without the need for a safelist in the
guest.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kvm/sys_regs.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 077293b5115f..7b8a8f6169d0 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1131,6 +1131,9 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
 		if (!vcpu_has_sve(vcpu))
 			val &= ~(0xfUL << ID_AA64PFR0_SVE_SHIFT);
 		val &= ~(0xfUL << ID_AA64PFR0_AMU_SHIFT);
+		if (!(val & (0xfUL << ID_AA64PFR0_CSV2_SHIFT)) &&
+		    arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED)
+			val |= (1UL << ID_AA64PFR0_CSV2_SHIFT);
 	} else if (id == SYS_ID_AA64ISAR1_EL1 && !vcpu_has_ptrauth(vcpu)) {
 		val &= ~((0xfUL << ID_AA64ISAR1_APA_SHIFT) |
 			 (0xfUL << ID_AA64ISAR1_API_SHIFT) |

From a8de949893880a26458de03f5bc70075aba13d95 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Tue, 15 Sep 2020 22:20:35 +0100
Subject: [PATCH 63/87] arm64: Group start_thread() functions together

The is_ttbrX_addr() functions have somehow ended up in the middle of
the start_thread() functions, so move them out of the way to keep the
code readable.

Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/processor.h | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 436ab1549ec6..d636c3eb9cf6 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -220,18 +220,6 @@ static inline void start_thread(struct pt_regs *regs, unsigned long pc,
 	regs->sp = sp;
 }
 
-static inline bool is_ttbr0_addr(unsigned long addr)
-{
-	/* entry assembly clears tags for TTBR0 addrs */
-	return addr < TASK_SIZE;
-}
-
-static inline bool is_ttbr1_addr(unsigned long addr)
-{
-	/* TTBR1 addresses may have a tag if KASAN_SW_TAGS is in use */
-	return arch_kasan_reset_tag(addr) >= PAGE_OFFSET;
-}
-
 #ifdef CONFIG_COMPAT
 static inline void compat_start_thread(struct pt_regs *regs, unsigned long pc,
 				       unsigned long sp)
@@ -252,6 +240,18 @@ static inline void compat_start_thread(struct pt_regs *regs, unsigned long pc,
 }
 #endif
 
+static inline bool is_ttbr0_addr(unsigned long addr)
+{
+	/* entry assembly clears tags for TTBR0 addrs */
+	return addr < TASK_SIZE;
+}
+
+static inline bool is_ttbr1_addr(unsigned long addr)
+{
+	/* TTBR1 addresses may have a tag if KASAN_SW_TAGS is in use */
+	return arch_kasan_reset_tag(addr) >= PAGE_OFFSET;
+}
+
 /* Forward declaration, a strange C thing */
 struct task_struct;
 

From 532d581583f25d4c297c721700f74b913f8cf37b Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Tue, 15 Sep 2020 23:56:12 +0100
Subject: [PATCH 64/87] arm64: Treat SSBS as a non-strict system feature

If all CPUs discovered during boot have SSBS, then spectre-v4 will be
considered to be "mitigated". However, we still allow late CPUs without
SSBS to be onlined, albeit with a "SANITY CHECK" warning. This is
problematic for userspace because it means that the system can quietly
transition to "Vulnerable" at runtime.

Avoid this by treating SSBS as a non-strict system feature: if all of
the CPUs discovered during boot have SSBS, then late arriving secondaries
better have it as well.

Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/cpufeature.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 4bb45b1d4ae4..033e6c952705 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -227,7 +227,7 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
 static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_MPAMFRAC_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_RASFRAC_SHIFT, 4, 0),
-	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SSBS_SHIFT, 4, ID_AA64PFR1_SSBS_PSTATE_NI),
+	ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SSBS_SHIFT, 4, ID_AA64PFR1_SSBS_PSTATE_NI),
 	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_BTI),
 				    FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_BT_SHIFT, 4, 0),
 	ARM64_FTR_END,
@@ -487,7 +487,7 @@ static const struct arm64_ftr_bits ftr_id_pfr1[] = {
 };
 
 static const struct arm64_ftr_bits ftr_id_pfr2[] = {
-	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR2_SSBS_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_SSBS_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_CSV3_SHIFT, 4, 0),
 	ARM64_FTR_END,
 };
@@ -1977,7 +1977,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 	{
 		.desc = "Speculative Store Bypassing Safe (SSBS)",
 		.capability = ARM64_SSBS,
-		.type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE,
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
 		.matches = has_cpuid_feature,
 		.sys_reg = SYS_ID_AA64PFR1_EL1,
 		.field_pos = ID_AA64PFR1_SSBS_SHIFT,

From 9b0955baa4208abd72840c13c5f56a0f133c7cb3 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Tue, 15 Sep 2020 23:00:31 +0100
Subject: [PATCH 65/87] arm64: Rename ARM64_SSBD to ARM64_SPECTRE_V4

In a similar manner to the renaming of ARM64_HARDEN_BRANCH_PREDICTOR
to ARM64_SPECTRE_V2, rename ARM64_SSBD to ARM64_SPECTRE_V4. This isn't
_entirely_ accurate, as we also need to take into account the interaction
with SSBS, but that will be taken care of in subsequent patches.

Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/cpucaps.h        | 2 +-
 arch/arm64/kernel/cpu_errata.c          | 2 +-
 arch/arm64/kvm/hyp/include/hyp/switch.h | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 348bfcf6c818..c4ac9a13ad5f 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -37,7 +37,7 @@
 #define ARM64_HAS_CACHE_IDC			27
 #define ARM64_HAS_CACHE_DIC			28
 #define ARM64_HW_DBM				29
-#define ARM64_SSBD				30
+#define ARM64_SPECTRE_V4			30
 #define ARM64_MISMATCHED_CACHE_TYPE		31
 #define ARM64_HAS_STAGE2_FWB			32
 #define ARM64_HAS_CRC32				33
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 4103391f51b5..6a48957d44fc 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -675,7 +675,7 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 #endif
 	{
 		.desc = "Speculative Store Bypass Disable",
-		.capability = ARM64_SSBD,
+		.capability = ARM64_SPECTRE_V4,
 		.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
 		.matches = has_ssbd_mitigation,
 		.cpu_enable = cpu_enable_ssbd_mitigation,
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index b503f19c37c5..5dec5589625d 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -481,7 +481,7 @@ exit:
 
 static inline bool __needs_ssbd_off(struct kvm_vcpu *vcpu)
 {
-	if (!cpus_have_final_cap(ARM64_SSBD))
+	if (!cpus_have_final_cap(ARM64_SPECTRE_V4))
 		return false;
 
 	return !(vcpu->arch.workaround_flags & VCPU_WORKAROUND_2_FLAG);

From 9e78b659b4539ee20fd0c415cf8c231cea59e9c0 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 18 Sep 2020 11:45:57 +0100
Subject: [PATCH 66/87] arm64: Move SSBD prctl() handler alongside other
 spectre mitigation code

As part of the spectre consolidation effort to shift all of the ghosts
into their own proton pack, move all of the horrible SSBD prctl() code
out of its own 'ssbd.c' file.

Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/Makefile      |   2 +-
 arch/arm64/kernel/proton-pack.c | 118 +++++++++++++++++++++++++++++
 arch/arm64/kernel/ssbd.c        | 129 --------------------------------
 3 files changed, 119 insertions(+), 130 deletions(-)
 delete mode 100644 arch/arm64/kernel/ssbd.c

diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 15b0fcbb6883..bd12b9a2ab4a 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -19,7 +19,7 @@ obj-y			:= debug-monitors.o entry.o irq.o fpsimd.o		\
 			   return_address.o cpuinfo.o cpu_errata.o		\
 			   cpufeature.o alternative.o cacheinfo.o		\
 			   smp.o smp_spin_table.o topology.o smccc-call.o	\
-			   ssbd.o syscall.o proton-pack.o
+			   syscall.o proton-pack.o
 
 targets			+= efi-entry.o
 
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
index 4f1411b45301..b373e4782ccd 100644
--- a/arch/arm64/kernel/proton-pack.c
+++ b/arch/arm64/kernel/proton-pack.c
@@ -20,6 +20,7 @@
 #include <linux/arm-smccc.h>
 #include <linux/cpu.h>
 #include <linux/device.h>
+#include <linux/nospec.h>
 #include <linux/prctl.h>
 
 #include <asm/spectre.h>
@@ -318,3 +319,120 @@ void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
 
 	update_mitigation_state(&spectre_v2_state, state);
 }
+
+/* Spectre v4 prctl */
+static void ssbd_ssbs_enable(struct task_struct *task)
+{
+	u64 val = is_compat_thread(task_thread_info(task)) ?
+		  PSR_AA32_SSBS_BIT : PSR_SSBS_BIT;
+
+	task_pt_regs(task)->pstate |= val;
+}
+
+static void ssbd_ssbs_disable(struct task_struct *task)
+{
+	u64 val = is_compat_thread(task_thread_info(task)) ?
+		  PSR_AA32_SSBS_BIT : PSR_SSBS_BIT;
+
+	task_pt_regs(task)->pstate &= ~val;
+}
+
+/*
+ * prctl interface for SSBD
+ */
+static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
+	int state = arm64_get_ssbd_state();
+
+	/* Unsupported */
+	if (state == ARM64_SSBD_UNKNOWN)
+		return -ENODEV;
+
+	/* Treat the unaffected/mitigated state separately */
+	if (state == ARM64_SSBD_MITIGATED) {
+		switch (ctrl) {
+		case PR_SPEC_ENABLE:
+			return -EPERM;
+		case PR_SPEC_DISABLE:
+		case PR_SPEC_FORCE_DISABLE:
+			return 0;
+		}
+	}
+
+	/*
+	 * Things are a bit backward here: the arm64 internal API
+	 * *enables the mitigation* when the userspace API *disables
+	 * speculation*. So much fun.
+	 */
+	switch (ctrl) {
+	case PR_SPEC_ENABLE:
+		/* If speculation is force disabled, enable is not allowed */
+		if (state == ARM64_SSBD_FORCE_ENABLE ||
+		    task_spec_ssb_force_disable(task))
+			return -EPERM;
+		task_clear_spec_ssb_disable(task);
+		clear_tsk_thread_flag(task, TIF_SSBD);
+		ssbd_ssbs_enable(task);
+		break;
+	case PR_SPEC_DISABLE:
+		if (state == ARM64_SSBD_FORCE_DISABLE)
+			return -EPERM;
+		task_set_spec_ssb_disable(task);
+		set_tsk_thread_flag(task, TIF_SSBD);
+		ssbd_ssbs_disable(task);
+		break;
+	case PR_SPEC_FORCE_DISABLE:
+		if (state == ARM64_SSBD_FORCE_DISABLE)
+			return -EPERM;
+		task_set_spec_ssb_disable(task);
+		task_set_spec_ssb_force_disable(task);
+		set_tsk_thread_flag(task, TIF_SSBD);
+		ssbd_ssbs_disable(task);
+		break;
+	default:
+		return -ERANGE;
+	}
+
+	return 0;
+}
+
+int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
+			     unsigned long ctrl)
+{
+	switch (which) {
+	case PR_SPEC_STORE_BYPASS:
+		return ssbd_prctl_set(task, ctrl);
+	default:
+		return -ENODEV;
+	}
+}
+
+static int ssbd_prctl_get(struct task_struct *task)
+{
+	switch (arm64_get_ssbd_state()) {
+	case ARM64_SSBD_UNKNOWN:
+		return -ENODEV;
+	case ARM64_SSBD_FORCE_ENABLE:
+		return PR_SPEC_DISABLE;
+	case ARM64_SSBD_KERNEL:
+		if (task_spec_ssb_force_disable(task))
+			return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
+		if (task_spec_ssb_disable(task))
+			return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+		return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+	case ARM64_SSBD_FORCE_DISABLE:
+		return PR_SPEC_ENABLE;
+	default:
+		return PR_SPEC_NOT_AFFECTED;
+	}
+}
+
+int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
+{
+	switch (which) {
+	case PR_SPEC_STORE_BYPASS:
+		return ssbd_prctl_get(task);
+	default:
+		return -ENODEV;
+	}
+}
diff --git a/arch/arm64/kernel/ssbd.c b/arch/arm64/kernel/ssbd.c
deleted file mode 100644
index b26955f56750..000000000000
--- a/arch/arm64/kernel/ssbd.c
+++ /dev/null
@@ -1,129 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2018 ARM Ltd, All Rights Reserved.
- */
-
-#include <linux/compat.h>
-#include <linux/errno.h>
-#include <linux/prctl.h>
-#include <linux/sched.h>
-#include <linux/sched/task_stack.h>
-#include <linux/thread_info.h>
-
-#include <asm/cpufeature.h>
-
-static void ssbd_ssbs_enable(struct task_struct *task)
-{
-	u64 val = is_compat_thread(task_thread_info(task)) ?
-		  PSR_AA32_SSBS_BIT : PSR_SSBS_BIT;
-
-	task_pt_regs(task)->pstate |= val;
-}
-
-static void ssbd_ssbs_disable(struct task_struct *task)
-{
-	u64 val = is_compat_thread(task_thread_info(task)) ?
-		  PSR_AA32_SSBS_BIT : PSR_SSBS_BIT;
-
-	task_pt_regs(task)->pstate &= ~val;
-}
-
-/*
- * prctl interface for SSBD
- */
-static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl)
-{
-	int state = arm64_get_ssbd_state();
-
-	/* Unsupported */
-	if (state == ARM64_SSBD_UNKNOWN)
-		return -ENODEV;
-
-	/* Treat the unaffected/mitigated state separately */
-	if (state == ARM64_SSBD_MITIGATED) {
-		switch (ctrl) {
-		case PR_SPEC_ENABLE:
-			return -EPERM;
-		case PR_SPEC_DISABLE:
-		case PR_SPEC_FORCE_DISABLE:
-			return 0;
-		}
-	}
-
-	/*
-	 * Things are a bit backward here: the arm64 internal API
-	 * *enables the mitigation* when the userspace API *disables
-	 * speculation*. So much fun.
-	 */
-	switch (ctrl) {
-	case PR_SPEC_ENABLE:
-		/* If speculation is force disabled, enable is not allowed */
-		if (state == ARM64_SSBD_FORCE_ENABLE ||
-		    task_spec_ssb_force_disable(task))
-			return -EPERM;
-		task_clear_spec_ssb_disable(task);
-		clear_tsk_thread_flag(task, TIF_SSBD);
-		ssbd_ssbs_enable(task);
-		break;
-	case PR_SPEC_DISABLE:
-		if (state == ARM64_SSBD_FORCE_DISABLE)
-			return -EPERM;
-		task_set_spec_ssb_disable(task);
-		set_tsk_thread_flag(task, TIF_SSBD);
-		ssbd_ssbs_disable(task);
-		break;
-	case PR_SPEC_FORCE_DISABLE:
-		if (state == ARM64_SSBD_FORCE_DISABLE)
-			return -EPERM;
-		task_set_spec_ssb_disable(task);
-		task_set_spec_ssb_force_disable(task);
-		set_tsk_thread_flag(task, TIF_SSBD);
-		ssbd_ssbs_disable(task);
-		break;
-	default:
-		return -ERANGE;
-	}
-
-	return 0;
-}
-
-int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
-			     unsigned long ctrl)
-{
-	switch (which) {
-	case PR_SPEC_STORE_BYPASS:
-		return ssbd_prctl_set(task, ctrl);
-	default:
-		return -ENODEV;
-	}
-}
-
-static int ssbd_prctl_get(struct task_struct *task)
-{
-	switch (arm64_get_ssbd_state()) {
-	case ARM64_SSBD_UNKNOWN:
-		return -ENODEV;
-	case ARM64_SSBD_FORCE_ENABLE:
-		return PR_SPEC_DISABLE;
-	case ARM64_SSBD_KERNEL:
-		if (task_spec_ssb_force_disable(task))
-			return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
-		if (task_spec_ssb_disable(task))
-			return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
-		return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
-	case ARM64_SSBD_FORCE_DISABLE:
-		return PR_SPEC_ENABLE;
-	default:
-		return PR_SPEC_NOT_AFFECTED;
-	}
-}
-
-int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
-{
-	switch (which) {
-	case PR_SPEC_STORE_BYPASS:
-		return ssbd_prctl_get(task);
-	default:
-		return -ENODEV;
-	}
-}

From c28762070ca651fe7a981b8f31d972c9b7d2c386 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 18 Sep 2020 11:54:33 +0100
Subject: [PATCH 67/87] arm64: Rewrite Spectre-v4 mitigation code

Rewrite the Spectre-v4 mitigation handling code to follow the same
approach as that taken by Spectre-v2.

For now, report to KVM that the system is vulnerable (by forcing
'ssbd_state' to ARM64_SSBD_UNKNOWN), as this will be cleared up in
subsequent steps.

Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/processor.h |  19 +-
 arch/arm64/include/asm/spectre.h   |   5 +
 arch/arm64/kernel/cpu_errata.c     | 217 +-------------
 arch/arm64/kernel/cpufeature.c     |  41 ---
 arch/arm64/kernel/entry.S          |   6 +-
 arch/arm64/kernel/hibernate.c      |   6 +-
 arch/arm64/kernel/process.c        |  17 +-
 arch/arm64/kernel/proton-pack.c    | 447 +++++++++++++++++++++++++----
 arch/arm64/kernel/suspend.c        |   3 +-
 9 files changed, 405 insertions(+), 356 deletions(-)

diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index d636c3eb9cf6..7d90ea2e2063 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -198,25 +198,12 @@ static inline void start_thread_common(struct pt_regs *regs, unsigned long pc)
 		regs->pmr_save = GIC_PRIO_IRQON;
 }
 
-static inline void set_ssbs_bit(struct pt_regs *regs)
-{
-	regs->pstate |= PSR_SSBS_BIT;
-}
-
-static inline void set_compat_ssbs_bit(struct pt_regs *regs)
-{
-	regs->pstate |= PSR_AA32_SSBS_BIT;
-}
-
 static inline void start_thread(struct pt_regs *regs, unsigned long pc,
 				unsigned long sp)
 {
 	start_thread_common(regs, pc);
 	regs->pstate = PSR_MODE_EL0t;
-
-	if (arm64_get_ssbd_state() != ARM64_SSBD_FORCE_ENABLE)
-		set_ssbs_bit(regs);
-
+	spectre_v4_enable_task_mitigation(current);
 	regs->sp = sp;
 }
 
@@ -233,9 +220,7 @@ static inline void compat_start_thread(struct pt_regs *regs, unsigned long pc,
 	regs->pstate |= PSR_AA32_E_BIT;
 #endif
 
-	if (arm64_get_ssbd_state() != ARM64_SSBD_FORCE_ENABLE)
-		set_compat_ssbs_bit(regs);
-
+	spectre_v4_enable_task_mitigation(current);
 	regs->compat_sp = sp;
 }
 #endif
diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h
index b776abe28dff..fcdfbce302bd 100644
--- a/arch/arm64/include/asm/spectre.h
+++ b/arch/arm64/include/asm/spectre.h
@@ -24,4 +24,9 @@ enum mitigation_state arm64_get_spectre_v2_state(void);
 bool has_spectre_v2(const struct arm64_cpu_capabilities *cap, int scope);
 void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused);
 
+enum mitigation_state arm64_get_spectre_v4_state(void);
+bool has_spectre_v4(const struct arm64_cpu_capabilities *cap, int scope);
+void spectre_v4_enable_mitigation(const struct arm64_cpu_capabilities *__unused);
+void spectre_v4_enable_task_mitigation(struct task_struct *tsk);
+
 #endif	/* __ASM_SPECTRE_H */
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 6a48957d44fc..7fc54c3d4285 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -106,62 +106,7 @@ cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *cap)
 		sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0);
 }
 
-DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);
-
-int ssbd_state __read_mostly = ARM64_SSBD_KERNEL;
-static bool __ssb_safe = true;
-
-static const struct ssbd_options {
-	const char	*str;
-	int		state;
-} ssbd_options[] = {
-	{ "force-on",	ARM64_SSBD_FORCE_ENABLE, },
-	{ "force-off",	ARM64_SSBD_FORCE_DISABLE, },
-	{ "kernel",	ARM64_SSBD_KERNEL, },
-};
-
-static int __init ssbd_cfg(char *buf)
-{
-	int i;
-
-	if (!buf || !buf[0])
-		return -EINVAL;
-
-	for (i = 0; i < ARRAY_SIZE(ssbd_options); i++) {
-		int len = strlen(ssbd_options[i].str);
-
-		if (strncmp(buf, ssbd_options[i].str, len))
-			continue;
-
-		ssbd_state = ssbd_options[i].state;
-		return 0;
-	}
-
-	return -EINVAL;
-}
-early_param("ssbd", ssbd_cfg);
-
-void __init arm64_update_smccc_conduit(struct alt_instr *alt,
-				       __le32 *origptr, __le32 *updptr,
-				       int nr_inst)
-{
-	u32 insn;
-
-	BUG_ON(nr_inst != 1);
-
-	switch (arm_smccc_1_1_get_conduit()) {
-	case SMCCC_CONDUIT_HVC:
-		insn = aarch64_insn_get_hvc_value();
-		break;
-	case SMCCC_CONDUIT_SMC:
-		insn = aarch64_insn_get_smc_value();
-		break;
-	default:
-		return;
-	}
-
-	*updptr = cpu_to_le32(insn);
-}
+int ssbd_state __read_mostly = ARM64_SSBD_UNKNOWN;
 
 void __init arm64_enable_wa2_handling(struct alt_instr *alt,
 				      __le32 *origptr, __le32 *updptr,
@@ -177,144 +122,6 @@ void __init arm64_enable_wa2_handling(struct alt_instr *alt,
 		*updptr = cpu_to_le32(aarch64_insn_gen_nop());
 }
 
-void arm64_set_ssbd_mitigation(bool state)
-{
-	int conduit;
-
-	if (this_cpu_has_cap(ARM64_SSBS)) {
-		if (state)
-			asm volatile(SET_PSTATE_SSBS(0));
-		else
-			asm volatile(SET_PSTATE_SSBS(1));
-		return;
-	}
-
-	conduit = arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, state,
-				       NULL);
-
-	WARN_ON_ONCE(conduit == SMCCC_CONDUIT_NONE);
-}
-
-static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry,
-				    int scope)
-{
-	struct arm_smccc_res res;
-	bool required = true;
-	s32 val;
-	bool this_cpu_safe = false;
-	int conduit;
-
-	WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
-
-	if (cpu_mitigations_off())
-		ssbd_state = ARM64_SSBD_FORCE_DISABLE;
-
-	/* delay setting __ssb_safe until we get a firmware response */
-	if (is_midr_in_range_list(read_cpuid_id(), entry->midr_range_list))
-		this_cpu_safe = true;
-
-	if (this_cpu_has_cap(ARM64_SSBS)) {
-		if (!this_cpu_safe)
-			__ssb_safe = false;
-		required = false;
-		goto out_printmsg;
-	}
-
-	conduit = arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
-				       ARM_SMCCC_ARCH_WORKAROUND_2, &res);
-
-	if (conduit == SMCCC_CONDUIT_NONE) {
-		ssbd_state = ARM64_SSBD_UNKNOWN;
-		if (!this_cpu_safe)
-			__ssb_safe = false;
-		return false;
-	}
-
-	val = (s32)res.a0;
-
-	switch (val) {
-	case SMCCC_RET_NOT_SUPPORTED:
-		ssbd_state = ARM64_SSBD_UNKNOWN;
-		if (!this_cpu_safe)
-			__ssb_safe = false;
-		return false;
-
-	/* machines with mixed mitigation requirements must not return this */
-	case SMCCC_RET_NOT_REQUIRED:
-		pr_info_once("%s mitigation not required\n", entry->desc);
-		ssbd_state = ARM64_SSBD_MITIGATED;
-		return false;
-
-	case SMCCC_RET_SUCCESS:
-		__ssb_safe = false;
-		required = true;
-		break;
-
-	case 1:	/* Mitigation not required on this CPU */
-		required = false;
-		break;
-
-	default:
-		WARN_ON(1);
-		if (!this_cpu_safe)
-			__ssb_safe = false;
-		return false;
-	}
-
-	switch (ssbd_state) {
-	case ARM64_SSBD_FORCE_DISABLE:
-		arm64_set_ssbd_mitigation(false);
-		required = false;
-		break;
-
-	case ARM64_SSBD_KERNEL:
-		if (required) {
-			__this_cpu_write(arm64_ssbd_callback_required, 1);
-			arm64_set_ssbd_mitigation(true);
-		}
-		break;
-
-	case ARM64_SSBD_FORCE_ENABLE:
-		arm64_set_ssbd_mitigation(true);
-		required = true;
-		break;
-
-	default:
-		WARN_ON(1);
-		break;
-	}
-
-out_printmsg:
-	switch (ssbd_state) {
-	case ARM64_SSBD_FORCE_DISABLE:
-		pr_info_once("%s disabled from command-line\n", entry->desc);
-		break;
-
-	case ARM64_SSBD_FORCE_ENABLE:
-		pr_info_once("%s forced from command-line\n", entry->desc);
-		break;
-	}
-
-	return required;
-}
-
-static void cpu_enable_ssbd_mitigation(const struct arm64_cpu_capabilities *cap)
-{
-	if (ssbd_state != ARM64_SSBD_FORCE_DISABLE)
-		cap->matches(cap, SCOPE_LOCAL_CPU);
-}
-
-/* known invulnerable cores */
-static const struct midr_range arm64_ssb_cpus[] = {
-	MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
-	MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
-	MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
-	MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
-	MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
-	MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
-	{},
-};
-
 #ifdef CONFIG_ARM64_ERRATUM_1463225
 DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa);
 
@@ -674,12 +481,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 	},
 #endif
 	{
-		.desc = "Speculative Store Bypass Disable",
+		.desc = "Spectre-v4",
 		.capability = ARM64_SPECTRE_V4,
 		.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
-		.matches = has_ssbd_mitigation,
-		.cpu_enable = cpu_enable_ssbd_mitigation,
-		.midr_range_list = arm64_ssb_cpus,
+		.matches = has_spectre_v4,
+		.cpu_enable = spectre_v4_enable_mitigation,
 	},
 #ifdef CONFIG_ARM64_ERRATUM_1418040
 	{
@@ -732,18 +538,3 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 	{
 	}
 };
-
-ssize_t cpu_show_spec_store_bypass(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	if (__ssb_safe)
-		return sprintf(buf, "Not affected\n");
-
-	switch (ssbd_state) {
-	case ARM64_SSBD_KERNEL:
-	case ARM64_SSBD_FORCE_ENABLE:
-		return sprintf(buf, "Mitigation: Speculative Store Bypass disabled via prctl\n");
-	}
-
-	return sprintf(buf, "Vulnerable\n");
-}
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 033e6c952705..a4debb63ebfb 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1583,46 +1583,6 @@ static void cpu_has_fwb(const struct arm64_cpu_capabilities *__unused)
 	WARN_ON(val & (7 << 27 | 7 << 21));
 }
 
-static int ssbs_emulation_handler(struct pt_regs *regs, u32 instr)
-{
-	if (user_mode(regs))
-		return 1;
-
-	if (instr & BIT(PSTATE_Imm_shift))
-		regs->pstate |= PSR_SSBS_BIT;
-	else
-		regs->pstate &= ~PSR_SSBS_BIT;
-
-	arm64_skip_faulting_instruction(regs, 4);
-	return 0;
-}
-
-static struct undef_hook ssbs_emulation_hook = {
-	.instr_mask	= ~(1U << PSTATE_Imm_shift),
-	.instr_val	= 0xd500401f | PSTATE_SSBS,
-	.fn		= ssbs_emulation_handler,
-};
-
-static void cpu_enable_ssbs(const struct arm64_cpu_capabilities *__unused)
-{
-	static bool undef_hook_registered = false;
-	static DEFINE_RAW_SPINLOCK(hook_lock);
-
-	raw_spin_lock(&hook_lock);
-	if (!undef_hook_registered) {
-		register_undef_hook(&ssbs_emulation_hook);
-		undef_hook_registered = true;
-	}
-	raw_spin_unlock(&hook_lock);
-
-	if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) {
-		sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS);
-		arm64_set_ssbd_mitigation(false);
-	} else {
-		arm64_set_ssbd_mitigation(true);
-	}
-}
-
 #ifdef CONFIG_ARM64_PAN
 static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused)
 {
@@ -1983,7 +1943,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.field_pos = ID_AA64PFR1_SSBS_SHIFT,
 		.sign = FTR_UNSIGNED,
 		.min_field_value = ID_AA64PFR1_SSBS_PSTATE_ONLY,
-		.cpu_enable = cpu_enable_ssbs,
 	},
 #ifdef CONFIG_ARM64_CNP
 	{
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 81b709349d7b..aeb337029d56 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -132,8 +132,8 @@ alternative_else_nop_endif
 	 * them if required.
 	 */
 	.macro	apply_ssbd, state, tmp1, tmp2
-alternative_cb	arm64_enable_wa2_handling
-	b	.L__asm_ssbd_skip\@
+alternative_cb	spectre_v4_patch_fw_mitigation_enable
+	b	.L__asm_ssbd_skip\@		// Patched to NOP
 alternative_cb_end
 	ldr_this_cpu	\tmp2, arm64_ssbd_callback_required, \tmp1
 	cbz	\tmp2,	.L__asm_ssbd_skip\@
@@ -141,7 +141,7 @@ alternative_cb_end
 	tbnz	\tmp2, #TIF_SSBD, .L__asm_ssbd_skip\@
 	mov	w0, #ARM_SMCCC_ARCH_WORKAROUND_2
 	mov	w1, #\state
-alternative_cb	arm64_update_smccc_conduit
+alternative_cb	spectre_v4_patch_fw_mitigation_conduit
 	nop					// Patched to SMC/HVC #0
 alternative_cb_end
 .L__asm_ssbd_skip\@:
diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index 68e14152d6e9..c7b00120dc3e 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -332,11 +332,7 @@ int swsusp_arch_suspend(void)
 		 * mitigation off behind our back, let's set the state
 		 * to what we expect it to be.
 		 */
-		switch (arm64_get_ssbd_state()) {
-		case ARM64_SSBD_FORCE_ENABLE:
-		case ARM64_SSBD_KERNEL:
-			arm64_set_ssbd_mitigation(true);
-		}
+		spectre_v4_enable_mitigation(NULL);
 	}
 
 	local_daif_restore(flags);
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index f1804496b935..9dbd35b95253 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -421,8 +421,7 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
 		    cpus_have_const_cap(ARM64_HAS_UAO))
 			childregs->pstate |= PSR_UAO_BIT;
 
-		if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE)
-			set_ssbs_bit(childregs);
+		spectre_v4_enable_task_mitigation(p);
 
 		if (system_uses_irq_prio_masking())
 			childregs->pmr_save = GIC_PRIO_IRQON;
@@ -472,8 +471,6 @@ void uao_thread_switch(struct task_struct *next)
  */
 static void ssbs_thread_switch(struct task_struct *next)
 {
-	struct pt_regs *regs = task_pt_regs(next);
-
 	/*
 	 * Nothing to do for kernel threads, but 'regs' may be junk
 	 * (e.g. idle task) so check the flags and bail early.
@@ -485,18 +482,10 @@ static void ssbs_thread_switch(struct task_struct *next)
 	 * If all CPUs implement the SSBS extension, then we just need to
 	 * context-switch the PSTATE field.
 	 */
-	if (cpu_have_feature(cpu_feature(SSBS)))
+	if (cpus_have_const_cap(ARM64_SSBS))
 		return;
 
-	/* If the mitigation is enabled, then we leave SSBS clear. */
-	if ((arm64_get_ssbd_state() == ARM64_SSBD_FORCE_ENABLE) ||
-	    test_tsk_thread_flag(next, TIF_SSBD))
-		return;
-
-	if (compat_user_mode(regs))
-		set_compat_ssbs_bit(regs);
-	else if (user_mode(regs))
-		set_ssbs_bit(regs);
+	spectre_v4_enable_task_mitigation(next);
 }
 
 /*
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
index b373e4782ccd..b1ea935fd948 100644
--- a/arch/arm64/kernel/proton-pack.c
+++ b/arch/arm64/kernel/proton-pack.c
@@ -320,79 +320,394 @@ void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
 	update_mitigation_state(&spectre_v2_state, state);
 }
 
-/* Spectre v4 prctl */
-static void ssbd_ssbs_enable(struct task_struct *task)
-{
-	u64 val = is_compat_thread(task_thread_info(task)) ?
-		  PSR_AA32_SSBS_BIT : PSR_SSBS_BIT;
+/*
+ * Spectre v4.
+ *
+ * If you thought Spectre v2 was nasty, wait until you see this mess. A CPU is
+ * either:
+ *
+ * - Mitigated in hardware and listed in our "safe list".
+ * - Mitigated in hardware via PSTATE.SSBS.
+ * - Mitigated in software by firmware (sometimes referred to as SSBD).
+ *
+ * Wait, that doesn't sound so bad, does it? Keep reading...
+ *
+ * A major source of headaches is that the software mitigation is enabled both
+ * on a per-task basis, but can also be forced on for the kernel, necessitating
+ * both context-switch *and* entry/exit hooks. To make it even worse, some CPUs
+ * allow EL0 to toggle SSBS directly, which can end up with the prctl() state
+ * being stale when re-entering the kernel. The usual big.LITTLE caveats apply,
+ * so you can have systems that have both firmware and SSBS mitigations. This
+ * means we actually have to reject late onlining of CPUs with mitigations if
+ * all of the currently onlined CPUs are safelisted, as the mitigation tends to
+ * be opt-in for userspace. Yes, really, the cure is worse than the disease.
+ *
+ * The only good part is that if the firmware mitigation is present, then it is
+ * present for all CPUs, meaning we don't have to worry about late onlining of a
+ * vulnerable CPU if one of the boot CPUs is using the firmware mitigation.
+ *
+ * Give me a VAX-11/780 any day of the week...
+ */
+static enum mitigation_state spectre_v4_state;
 
-	task_pt_regs(task)->pstate |= val;
+/* This is the per-cpu state tracking whether we need to talk to firmware */
+DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);
+
+enum spectre_v4_policy {
+	SPECTRE_V4_POLICY_MITIGATION_DYNAMIC,
+	SPECTRE_V4_POLICY_MITIGATION_ENABLED,
+	SPECTRE_V4_POLICY_MITIGATION_DISABLED,
+};
+
+static enum spectre_v4_policy __read_mostly __spectre_v4_policy;
+
+static const struct spectre_v4_param {
+	const char		*str;
+	enum spectre_v4_policy	policy;
+} spectre_v4_params[] = {
+	{ "force-on",	SPECTRE_V4_POLICY_MITIGATION_ENABLED, },
+	{ "force-off",	SPECTRE_V4_POLICY_MITIGATION_DISABLED, },
+	{ "kernel",	SPECTRE_V4_POLICY_MITIGATION_DYNAMIC, },
+};
+static int __init parse_spectre_v4_param(char *str)
+{
+	int i;
+
+	if (!str || !str[0])
+		return -EINVAL;
+
+	for (i = 0; i < ARRAY_SIZE(spectre_v4_params); i++) {
+		const struct spectre_v4_param *param = &spectre_v4_params[i];
+
+		if (strncmp(str, param->str, strlen(param->str)))
+			continue;
+
+		__spectre_v4_policy = param->policy;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+early_param("ssbd", parse_spectre_v4_param);
+
+/*
+ * Because this was all written in a rush by people working in different silos,
+ * we've ended up with multiple command line options to control the same thing.
+ * Wrap these up in some helpers, which prefer disabling the mitigation if faced
+ * with contradictory parameters. The mitigation is always either "off",
+ * "dynamic" or "on".
+ */
+static bool spectre_v4_mitigations_off(void)
+{
+	bool ret = cpu_mitigations_off() ||
+		   __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED;
+
+	if (ret)
+		pr_info_once("spectre-v4 mitigation disabled by command-line option\n");
+
+	return ret;
 }
 
-static void ssbd_ssbs_disable(struct task_struct *task)
+/* Do we need to toggle the mitigation state on entry to/exit from the kernel? */
+static bool spectre_v4_mitigations_dynamic(void)
 {
-	u64 val = is_compat_thread(task_thread_info(task)) ?
-		  PSR_AA32_SSBS_BIT : PSR_SSBS_BIT;
+	return !spectre_v4_mitigations_off() &&
+	       __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DYNAMIC;
+}
 
-	task_pt_regs(task)->pstate &= ~val;
+static bool spectre_v4_mitigations_on(void)
+{
+	return !spectre_v4_mitigations_off() &&
+	       __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_ENABLED;
+}
+
+ssize_t cpu_show_spec_store_bypass(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	switch (spectre_v4_state) {
+	case SPECTRE_UNAFFECTED:
+		return sprintf(buf, "Not affected\n");
+	case SPECTRE_MITIGATED:
+		return sprintf(buf, "Mitigation: Speculative Store Bypass disabled via prctl\n");
+	case SPECTRE_VULNERABLE:
+		fallthrough;
+	default:
+		return sprintf(buf, "Vulnerable\n");
+	}
+}
+
+enum mitigation_state arm64_get_spectre_v4_state(void)
+{
+	return spectre_v4_state;
+}
+
+static enum mitigation_state spectre_v4_get_cpu_hw_mitigation_state(void)
+{
+	static const struct midr_range spectre_v4_safe_list[] = {
+		MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
+		MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
+		MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
+		MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
+		MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
+		MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
+		{ /* sentinel */ },
+	};
+
+	if (is_midr_in_range_list(read_cpuid_id(), spectre_v4_safe_list))
+		return SPECTRE_UNAFFECTED;
+
+	/* CPU features are detected first */
+	if (this_cpu_has_cap(ARM64_SSBS))
+		return SPECTRE_MITIGATED;
+
+	return SPECTRE_VULNERABLE;
+}
+
+static enum mitigation_state spectre_v4_get_cpu_fw_mitigation_state(void)
+{
+	int ret;
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
+			     ARM_SMCCC_ARCH_WORKAROUND_2, &res);
+
+	ret = res.a0;
+	switch (ret) {
+	case SMCCC_RET_SUCCESS:
+		return SPECTRE_MITIGATED;
+	case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED:
+		fallthrough;
+	case SMCCC_RET_NOT_REQUIRED:
+		return SPECTRE_UNAFFECTED;
+	default:
+		fallthrough;
+	case SMCCC_RET_NOT_SUPPORTED:
+		return SPECTRE_VULNERABLE;
+	}
+}
+
+bool has_spectre_v4(const struct arm64_cpu_capabilities *cap, int scope)
+{
+	enum mitigation_state state;
+
+	WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
+
+	state = spectre_v4_get_cpu_hw_mitigation_state();
+	if (state == SPECTRE_VULNERABLE)
+		state = spectre_v4_get_cpu_fw_mitigation_state();
+
+	return state != SPECTRE_UNAFFECTED;
+}
+
+static int ssbs_emulation_handler(struct pt_regs *regs, u32 instr)
+{
+	if (user_mode(regs))
+		return 1;
+
+	if (instr & BIT(PSTATE_Imm_shift))
+		regs->pstate |= PSR_SSBS_BIT;
+	else
+		regs->pstate &= ~PSR_SSBS_BIT;
+
+	arm64_skip_faulting_instruction(regs, 4);
+	return 0;
+}
+
+static struct undef_hook ssbs_emulation_hook = {
+	.instr_mask	= ~(1U << PSTATE_Imm_shift),
+	.instr_val	= 0xd500401f | PSTATE_SSBS,
+	.fn		= ssbs_emulation_handler,
+};
+
+static enum mitigation_state spectre_v4_enable_hw_mitigation(void)
+{
+	static bool undef_hook_registered = false;
+	static DEFINE_RAW_SPINLOCK(hook_lock);
+	enum mitigation_state state;
+
+	/*
+	 * If the system is mitigated but this CPU doesn't have SSBS, then
+	 * we must be on the safelist and there's nothing more to do.
+	 */
+	state = spectre_v4_get_cpu_hw_mitigation_state();
+	if (state != SPECTRE_MITIGATED || !this_cpu_has_cap(ARM64_SSBS))
+		return state;
+
+	raw_spin_lock(&hook_lock);
+	if (!undef_hook_registered) {
+		register_undef_hook(&ssbs_emulation_hook);
+		undef_hook_registered = true;
+	}
+	raw_spin_unlock(&hook_lock);
+
+	if (spectre_v4_mitigations_off()) {
+		sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS);
+		asm volatile(SET_PSTATE_SSBS(1));
+		return SPECTRE_VULNERABLE;
+	}
+
+	/* SCTLR_EL1.DSSBS was initialised to 0 during boot */
+	asm volatile(SET_PSTATE_SSBS(0));
+	return SPECTRE_MITIGATED;
 }
 
 /*
- * prctl interface for SSBD
+ * Patch a branch over the Spectre-v4 mitigation code with a NOP so that
+ * we fallthrough and check whether firmware needs to be called on this CPU.
+ */
+void __init spectre_v4_patch_fw_mitigation_enable(struct alt_instr *alt,
+						  __le32 *origptr,
+						  __le32 *updptr, int nr_inst)
+{
+	BUG_ON(nr_inst != 1); /* Branch -> NOP */
+
+	if (spectre_v4_mitigations_off())
+		return;
+
+	if (cpus_have_final_cap(ARM64_SSBS))
+		return;
+
+	if (spectre_v4_mitigations_dynamic())
+		*updptr = cpu_to_le32(aarch64_insn_gen_nop());
+}
+
+/*
+ * Patch a NOP in the Spectre-v4 mitigation code with an SMC/HVC instruction
+ * to call into firmware to adjust the mitigation state.
+ */
+void __init spectre_v4_patch_fw_mitigation_conduit(struct alt_instr *alt,
+						   __le32 *origptr,
+						   __le32 *updptr, int nr_inst)
+{
+	u32 insn;
+
+	BUG_ON(nr_inst != 1); /* NOP -> HVC/SMC */
+
+	switch (arm_smccc_1_1_get_conduit()) {
+	case SMCCC_CONDUIT_HVC:
+		insn = aarch64_insn_get_hvc_value();
+		break;
+	case SMCCC_CONDUIT_SMC:
+		insn = aarch64_insn_get_smc_value();
+		break;
+	default:
+		return;
+	}
+
+	*updptr = cpu_to_le32(insn);
+}
+
+static enum mitigation_state spectre_v4_enable_fw_mitigation(void)
+{
+	enum mitigation_state state;
+
+	state = spectre_v4_get_cpu_fw_mitigation_state();
+	if (state != SPECTRE_MITIGATED)
+		return state;
+
+	if (spectre_v4_mitigations_off()) {
+		arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, false, NULL);
+		return SPECTRE_VULNERABLE;
+	}
+
+	arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, true, NULL);
+
+	if (spectre_v4_mitigations_dynamic())
+		__this_cpu_write(arm64_ssbd_callback_required, 1);
+
+	return SPECTRE_MITIGATED;
+}
+
+void spectre_v4_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
+{
+	enum mitigation_state state;
+
+	WARN_ON(preemptible());
+
+	state = spectre_v4_enable_hw_mitigation();
+	if (state == SPECTRE_VULNERABLE)
+		state = spectre_v4_enable_fw_mitigation();
+
+	update_mitigation_state(&spectre_v4_state, state);
+}
+
+static void __update_pstate_ssbs(struct pt_regs *regs, bool state)
+{
+	u64 bit = compat_user_mode(regs) ? PSR_AA32_SSBS_BIT : PSR_SSBS_BIT;
+
+	if (state)
+		regs->pstate |= bit;
+	else
+		regs->pstate &= ~bit;
+}
+
+void spectre_v4_enable_task_mitigation(struct task_struct *tsk)
+{
+	struct pt_regs *regs = task_pt_regs(tsk);
+	bool ssbs = false, kthread = tsk->flags & PF_KTHREAD;
+
+	if (spectre_v4_mitigations_off())
+		ssbs = true;
+	else if (spectre_v4_mitigations_dynamic() && !kthread)
+		ssbs = !test_tsk_thread_flag(tsk, TIF_SSBD);
+
+	__update_pstate_ssbs(regs, ssbs);
+}
+
+/*
+ * The Spectre-v4 mitigation can be controlled via a prctl() from userspace.
+ * This is interesting because the "speculation disabled" behaviour can be
+ * configured so that it is preserved across exec(), which means that the
+ * prctl() may be necessary even when PSTATE.SSBS can be toggled directly
+ * from userspace.
  */
 static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl)
 {
-	int state = arm64_get_ssbd_state();
-
-	/* Unsupported */
-	if (state == ARM64_SSBD_UNKNOWN)
-		return -ENODEV;
-
-	/* Treat the unaffected/mitigated state separately */
-	if (state == ARM64_SSBD_MITIGATED) {
-		switch (ctrl) {
-		case PR_SPEC_ENABLE:
-			return -EPERM;
-		case PR_SPEC_DISABLE:
-		case PR_SPEC_FORCE_DISABLE:
-			return 0;
-		}
-	}
-
-	/*
-	 * Things are a bit backward here: the arm64 internal API
-	 * *enables the mitigation* when the userspace API *disables
-	 * speculation*. So much fun.
-	 */
 	switch (ctrl) {
 	case PR_SPEC_ENABLE:
-		/* If speculation is force disabled, enable is not allowed */
-		if (state == ARM64_SSBD_FORCE_ENABLE ||
-		    task_spec_ssb_force_disable(task))
+		/* Enable speculation: disable mitigation */
+		/*
+		 * Force disabled speculation prevents it from being
+		 * re-enabled.
+		 */
+		if (task_spec_ssb_force_disable(task))
 			return -EPERM;
+
+		/*
+		 * If the mitigation is forced on, then speculation is forced
+		 * off and we again prevent it from being re-enabled.
+		 */
+		if (spectre_v4_mitigations_on())
+			return -EPERM;
+
 		task_clear_spec_ssb_disable(task);
 		clear_tsk_thread_flag(task, TIF_SSBD);
-		ssbd_ssbs_enable(task);
-		break;
-	case PR_SPEC_DISABLE:
-		if (state == ARM64_SSBD_FORCE_DISABLE)
-			return -EPERM;
-		task_set_spec_ssb_disable(task);
-		set_tsk_thread_flag(task, TIF_SSBD);
-		ssbd_ssbs_disable(task);
 		break;
 	case PR_SPEC_FORCE_DISABLE:
-		if (state == ARM64_SSBD_FORCE_DISABLE)
+		/* Force disable speculation: force enable mitigation */
+		/*
+		 * If the mitigation is forced off, then speculation is forced
+		 * on and we prevent it from being disabled.
+		 */
+		if (spectre_v4_mitigations_off())
 			return -EPERM;
-		task_set_spec_ssb_disable(task);
+
 		task_set_spec_ssb_force_disable(task);
+		fallthrough;
+	case PR_SPEC_DISABLE:
+		/* Disable speculation: enable mitigation */
+		/* Same as PR_SPEC_FORCE_DISABLE */
+		if (spectre_v4_mitigations_off())
+			return -EPERM;
+
+		task_set_spec_ssb_disable(task);
 		set_tsk_thread_flag(task, TIF_SSBD);
-		ssbd_ssbs_disable(task);
 		break;
 	default:
 		return -ERANGE;
 	}
 
+	spectre_v4_enable_task_mitigation(task);
 	return 0;
 }
 
@@ -409,22 +724,32 @@ int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
 
 static int ssbd_prctl_get(struct task_struct *task)
 {
-	switch (arm64_get_ssbd_state()) {
-	case ARM64_SSBD_UNKNOWN:
-		return -ENODEV;
-	case ARM64_SSBD_FORCE_ENABLE:
-		return PR_SPEC_DISABLE;
-	case ARM64_SSBD_KERNEL:
-		if (task_spec_ssb_force_disable(task))
-			return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
-		if (task_spec_ssb_disable(task))
-			return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
-		return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
-	case ARM64_SSBD_FORCE_DISABLE:
-		return PR_SPEC_ENABLE;
-	default:
+	switch (spectre_v4_state) {
+	case SPECTRE_UNAFFECTED:
 		return PR_SPEC_NOT_AFFECTED;
+	case SPECTRE_MITIGATED:
+		if (spectre_v4_mitigations_on())
+			return PR_SPEC_NOT_AFFECTED;
+
+		if (spectre_v4_mitigations_dynamic())
+			break;
+
+		/* Mitigations are disabled, so we're vulnerable. */
+		fallthrough;
+	case SPECTRE_VULNERABLE:
+		fallthrough;
+	default:
+		return PR_SPEC_ENABLE;
 	}
+
+	/* Check the mitigation state for this task */
+	if (task_spec_ssb_force_disable(task))
+		return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
+
+	if (task_spec_ssb_disable(task))
+		return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+
+	return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
 }
 
 int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c
index c1dee9066ff9..584c14ce3c86 100644
--- a/arch/arm64/kernel/suspend.c
+++ b/arch/arm64/kernel/suspend.c
@@ -72,8 +72,7 @@ void notrace __cpu_suspend_exit(void)
 	 * have turned the mitigation on. If the user has forcefully
 	 * disabled it, make sure their wishes are obeyed.
 	 */
-	if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE)
-		arm64_set_ssbd_mitigation(false);
+	spectre_v4_enable_mitigation(NULL);
 }
 
 /*

From 29e8910a566aad3ee72f729e45842858d51ced8d Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Fri, 18 Sep 2020 12:25:40 +0100
Subject: [PATCH 68/87] KVM: arm64: Simplify handling of ARCH_WORKAROUND_2

Owing to the fact that the host kernel is always mitigated, we can
drastically simplify the WA2 handling by keeping the mitigation
state ON when entering the guest. This means the guest is either
unaffected or not mitigated.

This results in a nice simplification of the mitigation space,
and the removal of a lot of code that was never really used anyway.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/kvm_asm.h        |  3 --
 arch/arm64/include/asm/kvm_emulate.h    | 14 ------
 arch/arm64/include/asm/kvm_mmu.h        | 17 -------
 arch/arm64/include/uapi/asm/kvm.h       |  9 ++++
 arch/arm64/kernel/cpu_errata.c          | 14 ------
 arch/arm64/kernel/image-vars.h          |  2 -
 arch/arm64/kvm/arm.c                    |  4 --
 arch/arm64/kvm/hyp/hyp-entry.S          | 27 -----------
 arch/arm64/kvm/hyp/include/hyp/switch.h | 29 ------------
 arch/arm64/kvm/hyp/nvhe/switch.c        |  4 --
 arch/arm64/kvm/hyp/vhe/switch.c         |  4 --
 arch/arm64/kvm/hypercalls.c             | 14 +++---
 arch/arm64/kvm/psci.c                   | 59 +++++++++++--------------
 arch/arm64/kvm/reset.c                  |  4 --
 14 files changed, 41 insertions(+), 163 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index e9378cc8049d..abe02cf66880 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -9,9 +9,6 @@
 
 #include <asm/virt.h>
 
-#define	VCPU_WORKAROUND_2_FLAG_SHIFT	0
-#define	VCPU_WORKAROUND_2_FLAG		(_AC(1, UL) << VCPU_WORKAROUND_2_FLAG_SHIFT)
-
 #define ARM_EXIT_WITH_SERROR_BIT  31
 #define ARM_EXCEPTION_CODE(x)	  ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT))
 #define ARM_EXCEPTION_IS_TRAP(x)  (ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_TRAP)
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 49a55be2b9a2..96eccb107ec2 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -383,20 +383,6 @@ static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu)
 	return vcpu_read_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK;
 }
 
-static inline bool kvm_arm_get_vcpu_workaround_2_flag(struct kvm_vcpu *vcpu)
-{
-	return vcpu->arch.workaround_flags & VCPU_WORKAROUND_2_FLAG;
-}
-
-static inline void kvm_arm_set_vcpu_workaround_2_flag(struct kvm_vcpu *vcpu,
-						      bool flag)
-{
-	if (flag)
-		vcpu->arch.workaround_flags |= VCPU_WORKAROUND_2_FLAG;
-	else
-		vcpu->arch.workaround_flags &= ~VCPU_WORKAROUND_2_FLAG;
-}
-
 static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu)
 {
 	if (vcpu_mode_is_32bit(vcpu)) {
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 873e12430ac7..36606ef9e435 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -526,23 +526,6 @@ static inline int kvm_map_vectors(void)
 }
 #endif
 
-DECLARE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);
-
-static inline int hyp_map_aux_data(void)
-{
-	int cpu, err;
-
-	for_each_possible_cpu(cpu) {
-		u64 *ptr;
-
-		ptr = per_cpu_ptr(&arm64_ssbd_callback_required, cpu);
-		err = create_hyp_mappings(ptr, ptr + 1, PAGE_HYP);
-		if (err)
-			return err;
-	}
-	return 0;
-}
-
 #define kvm_phys_to_vttbr(addr)		phys_to_ttbr(addr)
 
 /*
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index ba85bb23f060..7d804fd0a682 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -242,6 +242,15 @@ struct kvm_vcpu_events {
 #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL		0
 #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL		1
 #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED	2
+
+/*
+ * Only two states can be presented by the host kernel:
+ * - NOT_REQUIRED: the guest doesn't need to do anything
+ * - NOT_AVAIL: the guest isn't mitigated (it can still use SSBS if available)
+ *
+ * All the other values are deprecated. The host still accepts all
+ * values (they are ABI), but will narrow them to the above two.
+ */
 #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2	KVM_REG_ARM_FW_REG(2)
 #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL		0
 #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN		1
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 7fc54c3d4285..7e9caef13db4 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -108,20 +108,6 @@ cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *cap)
 
 int ssbd_state __read_mostly = ARM64_SSBD_UNKNOWN;
 
-void __init arm64_enable_wa2_handling(struct alt_instr *alt,
-				      __le32 *origptr, __le32 *updptr,
-				      int nr_inst)
-{
-	BUG_ON(nr_inst != 1);
-	/*
-	 * Only allow mitigation on EL1 entry/exit and guest
-	 * ARCH_WORKAROUND_2 handling if the SSBD state allows it to
-	 * be flipped.
-	 */
-	if (arm64_get_ssbd_state() == ARM64_SSBD_KERNEL)
-		*updptr = cpu_to_le32(aarch64_insn_gen_nop());
-}
-
 #ifdef CONFIG_ARM64_ERRATUM_1463225
 DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa);
 
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 8982b68289b7..d0f3f35dd0d7 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -64,12 +64,10 @@ __efistub__ctype		= _ctype;
 #define KVM_NVHE_ALIAS(sym) __kvm_nvhe_##sym = sym;
 
 /* Alternative callbacks for init-time patching of nVHE hyp code. */
-KVM_NVHE_ALIAS(arm64_enable_wa2_handling);
 KVM_NVHE_ALIAS(kvm_patch_vector_branch);
 KVM_NVHE_ALIAS(kvm_update_va_mask);
 
 /* Global kernel state accessed by nVHE hyp code. */
-KVM_NVHE_ALIAS(arm64_ssbd_callback_required);
 KVM_NVHE_ALIAS(kvm_host_data);
 KVM_NVHE_ALIAS(kvm_vgic_global_state);
 
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 46dc3d75cf13..b1bd2f4cd3e0 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1549,10 +1549,6 @@ static int init_hyp_mode(void)
 		}
 	}
 
-	err = hyp_map_aux_data();
-	if (err)
-		kvm_err("Cannot map host auxiliary data: %d\n", err);
-
 	return 0;
 
 out_err:
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index db2dd7500617..3cca756e9e41 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -116,33 +116,6 @@ el1_hvc_guest:
 			  ARM_SMCCC_ARCH_WORKAROUND_2)
 	cbnz	w1, el1_trap
 
-alternative_cb	arm64_enable_wa2_handling
-	b	wa2_end
-alternative_cb_end
-	get_vcpu_ptr	x2, x0
-	ldr	x0, [x2, #VCPU_WORKAROUND_FLAGS]
-
-	// Sanitize the argument and update the guest flags
-	ldr	x1, [sp, #8]			// Guest's x1
-	clz	w1, w1				// Murphy's device:
-	lsr	w1, w1, #5			// w1 = !!w1 without using
-	eor	w1, w1, #1			// the flags...
-	bfi	x0, x1, #VCPU_WORKAROUND_2_FLAG_SHIFT, #1
-	str	x0, [x2, #VCPU_WORKAROUND_FLAGS]
-
-	/* Check that we actually need to perform the call */
-	hyp_ldr_this_cpu x0, arm64_ssbd_callback_required, x2
-	cbz	x0, wa2_end
-
-	mov	w0, #ARM_SMCCC_ARCH_WORKAROUND_2
-	smc	#0
-
-	/* Don't leak data from the SMC call */
-	mov	x3, xzr
-wa2_end:
-	mov	x2, xzr
-	mov	x1, xzr
-
 wa_epilogue:
 	mov	x0, xzr
 	add	sp, sp, #16
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 5dec5589625d..a6840823b60e 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -479,35 +479,6 @@ exit:
 	return false;
 }
 
-static inline bool __needs_ssbd_off(struct kvm_vcpu *vcpu)
-{
-	if (!cpus_have_final_cap(ARM64_SPECTRE_V4))
-		return false;
-
-	return !(vcpu->arch.workaround_flags & VCPU_WORKAROUND_2_FLAG);
-}
-
-static inline void __set_guest_arch_workaround_state(struct kvm_vcpu *vcpu)
-{
-	/*
-	 * The host runs with the workaround always present. If the
-	 * guest wants it disabled, so be it...
-	 */
-	if (__needs_ssbd_off(vcpu) &&
-	    __hyp_this_cpu_read(arm64_ssbd_callback_required))
-		arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, 0, NULL);
-}
-
-static inline void __set_host_arch_workaround_state(struct kvm_vcpu *vcpu)
-{
-	/*
-	 * If the guest has disabled the workaround, bring it back on.
-	 */
-	if (__needs_ssbd_off(vcpu) &&
-	    __hyp_this_cpu_read(arm64_ssbd_callback_required))
-		arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, 1, NULL);
-}
-
 static inline void __kvm_unexpected_el2_exception(void)
 {
 	unsigned long addr, fixup;
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 0970442d2dbc..8d3dd4f47924 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -202,8 +202,6 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 
 	__debug_switch_to_guest(vcpu);
 
-	__set_guest_arch_workaround_state(vcpu);
-
 	do {
 		/* Jump in the fire! */
 		exit_code = __guest_enter(vcpu, host_ctxt);
@@ -211,8 +209,6 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 		/* And we're baaack! */
 	} while (fixup_guest_exit(vcpu, &exit_code));
 
-	__set_host_arch_workaround_state(vcpu);
-
 	__sysreg_save_state_nvhe(guest_ctxt);
 	__sysreg32_save_state(vcpu);
 	__timer_disable_traps(vcpu);
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index c1da4f86ccac..ecf67e678203 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -131,8 +131,6 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
 	sysreg_restore_guest_state_vhe(guest_ctxt);
 	__debug_switch_to_guest(vcpu);
 
-	__set_guest_arch_workaround_state(vcpu);
-
 	do {
 		/* Jump in the fire! */
 		exit_code = __guest_enter(vcpu, host_ctxt);
@@ -140,8 +138,6 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
 		/* And we're baaack! */
 	} while (fixup_guest_exit(vcpu, &exit_code));
 
-	__set_host_arch_workaround_state(vcpu);
-
 	sysreg_save_guest_state_vhe(guest_ctxt);
 
 	__deactivate_traps(vcpu);
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 413d46b9bc07..69e023dfafce 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -36,15 +36,13 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
 			}
 			break;
 		case ARM_SMCCC_ARCH_WORKAROUND_2:
-			switch (kvm_arm_have_ssbd()) {
-			case KVM_SSBD_FORCE_DISABLE:
-			case KVM_SSBD_UNKNOWN:
+			switch (arm64_get_ssbd_state()) {
+			case ARM64_SSBD_FORCE_DISABLE:
+			case ARM64_SSBD_UNKNOWN:
 				break;
-			case KVM_SSBD_KERNEL:
-				val = SMCCC_RET_SUCCESS;
-				break;
-			case KVM_SSBD_FORCE_ENABLE:
-			case KVM_SSBD_MITIGATED:
+			case ARM64_SSBD_KERNEL:
+			case ARM64_SSBD_FORCE_ENABLE:
+			case ARM64_SSBD_MITIGATED:
 				val = SMCCC_RET_NOT_REQUIRED;
 				break;
 			}
diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c
index fbdd6f3bea7f..87e6e3818fb5 100644
--- a/arch/arm64/kvm/psci.c
+++ b/arch/arm64/kvm/psci.c
@@ -435,17 +435,15 @@ static int get_kernel_wa_level(u64 regid)
 		}
 		return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
 	case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
-		switch (kvm_arm_have_ssbd()) {
-		case KVM_SSBD_FORCE_DISABLE:
-			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
-		case KVM_SSBD_KERNEL:
-			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL;
-		case KVM_SSBD_FORCE_ENABLE:
-		case KVM_SSBD_MITIGATED:
+		switch (arm64_get_ssbd_state()) {
+		case ARM64_SSBD_FORCE_ENABLE:
+		case ARM64_SSBD_MITIGATED:
+		case ARM64_SSBD_KERNEL:
 			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED;
-		case KVM_SSBD_UNKNOWN:
+		case ARM64_SSBD_UNKNOWN:
+		case ARM64_SSBD_FORCE_DISABLE:
 		default:
-			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN;
+			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
 		}
 	}
 
@@ -462,14 +460,8 @@ int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 		val = kvm_psci_version(vcpu, vcpu->kvm);
 		break;
 	case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
-		val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
-		break;
 	case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
 		val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK;
-
-		if (val == KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL &&
-		    kvm_arm_get_vcpu_workaround_2_flag(vcpu))
-			val |= KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED;
 		break;
 	default:
 		return -ENOENT;
@@ -527,34 +519,35 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 			    KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED))
 			return -EINVAL;
 
-		wa_level = val & KVM_REG_FEATURE_LEVEL_MASK;
-
-		if (get_kernel_wa_level(reg->id) < wa_level)
-			return -EINVAL;
-
 		/* The enabled bit must not be set unless the level is AVAIL. */
-		if (wa_level != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL &&
-		    wa_level != val)
+		if ((val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED) &&
+		    (val & KVM_REG_FEATURE_LEVEL_MASK) != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL)
 			return -EINVAL;
 
-		/* Are we finished or do we need to check the enable bit ? */
-		if (kvm_arm_have_ssbd() != KVM_SSBD_KERNEL)
-			return 0;
-
 		/*
-		 * If this kernel supports the workaround to be switched on
-		 * or off, make sure it matches the requested setting.
+		 * Map all the possible incoming states to the only two we
+		 * really want to deal with.
 		 */
-		switch (wa_level) {
+		switch (val & KVM_REG_FEATURE_LEVEL_MASK) {
+		case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL:
+		case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN:
+			wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
+			break;
 		case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL:
-			kvm_arm_set_vcpu_workaround_2_flag(vcpu,
-			    val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED);
-			break;
 		case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED:
-			kvm_arm_set_vcpu_workaround_2_flag(vcpu, true);
+			wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED;
 			break;
+		default:
+			return -EINVAL;
 		}
 
+		/*
+		 * We can deal with NOT_AVAIL on NOT_REQUIRED, but not the
+		 * other way around.
+		 */
+		if (get_kernel_wa_level(reg->id) < wa_level)
+			return -EINVAL;
+
 		return 0;
 	default:
 		return -ENOENT;
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index ee33875c5c2a..f6e8b4a75cbb 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -319,10 +319,6 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 		vcpu->arch.reset_state.reset = false;
 	}
 
-	/* Default workaround setup is enabled (if supported) */
-	if (kvm_arm_have_ssbd() == KVM_SSBD_KERNEL)
-		vcpu->arch.workaround_flags |= VCPU_WORKAROUND_2_FLAG;
-
 	/* Reset timer */
 	ret = kvm_timer_vcpu_reset(vcpu);
 out:

From 7311467702710cc30ac4e3a6c6670a766e7667f9 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Fri, 18 Sep 2020 13:59:32 +0100
Subject: [PATCH 69/87] KVM: arm64: Get rid of kvm_arm_have_ssbd()

kvm_arm_have_ssbd() is now completely unused, get rid of it.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/kvm_host.h | 23 -----------------------
 1 file changed, 23 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 59f6cff0f0cb..8785ebf9bc29 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -631,29 +631,6 @@ static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {}
 static inline void kvm_clr_pmu_events(u32 clr) {}
 #endif
 
-#define KVM_SSBD_UNKNOWN		-1
-#define KVM_SSBD_FORCE_DISABLE		0
-#define KVM_SSBD_KERNEL		1
-#define KVM_SSBD_FORCE_ENABLE		2
-#define KVM_SSBD_MITIGATED		3
-
-static inline int kvm_arm_have_ssbd(void)
-{
-	switch (arm64_get_ssbd_state()) {
-	case ARM64_SSBD_FORCE_DISABLE:
-		return KVM_SSBD_FORCE_DISABLE;
-	case ARM64_SSBD_KERNEL:
-		return KVM_SSBD_KERNEL;
-	case ARM64_SSBD_FORCE_ENABLE:
-		return KVM_SSBD_FORCE_ENABLE;
-	case ARM64_SSBD_MITIGATED:
-		return KVM_SSBD_MITIGATED;
-	case ARM64_SSBD_UNKNOWN:
-	default:
-		return KVM_SSBD_UNKNOWN;
-	}
-}
-
 void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu);
 void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu);
 

From d63d975a71b332df36cc802e6e77a462af6b9fef Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Fri, 18 Sep 2020 14:08:54 +0100
Subject: [PATCH 70/87] KVM: arm64: Convert ARCH_WORKAROUND_2 to
 arm64_get_spectre_v4_state()

Convert the KVM WA2 code to using the Spectre infrastructure,
making the code much more readable. It also allows us to
take SSBS into account for the mitigation.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kvm/arm.c        |  2 +-
 arch/arm64/kvm/hypercalls.c | 23 +++++++++++++++++------
 arch/arm64/kvm/psci.c       | 19 ++++++++++++-------
 3 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index b1bd2f4cd3e0..9b90a701462c 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1292,7 +1292,7 @@ static void cpu_init_hyp_mode(void)
 	 * at EL2.
 	 */
 	if (this_cpu_has_cap(ARM64_SSBS) &&
-	    arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) {
+	    arm64_get_spectre_v4_state() == SPECTRE_VULNERABLE) {
 		kvm_call_hyp_nvhe(__kvm_enable_ssbs);
 	}
 }
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 69e023dfafce..9824025ccc5c 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -36,13 +36,24 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
 			}
 			break;
 		case ARM_SMCCC_ARCH_WORKAROUND_2:
-			switch (arm64_get_ssbd_state()) {
-			case ARM64_SSBD_FORCE_DISABLE:
-			case ARM64_SSBD_UNKNOWN:
+			switch (arm64_get_spectre_v4_state()) {
+			case SPECTRE_VULNERABLE:
 				break;
-			case ARM64_SSBD_KERNEL:
-			case ARM64_SSBD_FORCE_ENABLE:
-			case ARM64_SSBD_MITIGATED:
+			case SPECTRE_MITIGATED:
+				/*
+				 * SSBS everywhere: Indicate no firmware
+				 * support, as the SSBS support will be
+				 * indicated to the guest and the default is
+				 * safe.
+				 *
+				 * Otherwise, expose a permanent mitigation
+				 * to the guest, and hide SSBS so that the
+				 * guest stays protected.
+				 */
+				if (cpus_have_final_cap(ARM64_SSBS))
+					break;
+				fallthrough;
+			case SPECTRE_UNAFFECTED:
 				val = SMCCC_RET_NOT_REQUIRED;
 				break;
 			}
diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c
index 87e6e3818fb5..db4056ecccfd 100644
--- a/arch/arm64/kvm/psci.c
+++ b/arch/arm64/kvm/psci.c
@@ -435,14 +435,19 @@ static int get_kernel_wa_level(u64 regid)
 		}
 		return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
 	case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
-		switch (arm64_get_ssbd_state()) {
-		case ARM64_SSBD_FORCE_ENABLE:
-		case ARM64_SSBD_MITIGATED:
-		case ARM64_SSBD_KERNEL:
+		switch (arm64_get_spectre_v4_state()) {
+		case SPECTRE_MITIGATED:
+			/*
+			 * As for the hypercall discovery, we pretend we
+			 * don't have any FW mitigation if SSBS is there at
+			 * all times.
+			 */
+			if (cpus_have_final_cap(ARM64_SSBS))
+				return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
+			fallthrough;
+		case SPECTRE_UNAFFECTED:
 			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED;
-		case ARM64_SSBD_UNKNOWN:
-		case ARM64_SSBD_FORCE_DISABLE:
-		default:
+		case SPECTRE_VULNERABLE:
 			return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
 		}
 	}

From 31c84d6c9cde673b3866972552ec52e4c522b4fa Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Fri, 18 Sep 2020 14:11:25 +0100
Subject: [PATCH 71/87] arm64: Get rid of arm64_ssbd_state

Out with the old ghost, in with the new...

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/cpufeature.h | 14 --------------
 arch/arm64/kernel/cpu_errata.c      |  2 --
 2 files changed, 16 deletions(-)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 3b48aa121cee..fba6700b457b 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -698,20 +698,6 @@ static inline bool system_supports_tlb_range(void)
 		cpus_have_const_cap(ARM64_HAS_TLB_RANGE);
 }
 
-#define ARM64_SSBD_UNKNOWN		-1
-#define ARM64_SSBD_FORCE_DISABLE	0
-#define ARM64_SSBD_KERNEL		1
-#define ARM64_SSBD_FORCE_ENABLE		2
-#define ARM64_SSBD_MITIGATED		3
-
-static inline int arm64_get_ssbd_state(void)
-{
-	extern int ssbd_state;
-	return ssbd_state;
-}
-
-void arm64_set_ssbd_mitigation(bool state);
-
 extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
 
 static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 7e9caef13db4..6c8303559beb 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -106,8 +106,6 @@ cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *cap)
 		sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0);
 }
 
-int ssbd_state __read_mostly = ARM64_SSBD_UNKNOWN;
-
 #ifdef CONFIG_ARM64_ERRATUM_1463225
 DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa);
 

From 9ef2b48be9bba70ded9372fc81e678e707306060 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Mon, 28 Sep 2020 11:45:24 +0100
Subject: [PATCH 72/87] KVM: arm64: Allow patching EL2 vectors even with KASLR
 is not enabled

Patching the EL2 exception vectors is integral to the Spectre-v2
workaround, where it can be necessary to execute CPU-specific sequences
to nobble the branch predictor before running the hypervisor text proper.

Remove the dependency on CONFIG_RANDOMIZE_BASE and allow the EL2 vectors
to be patched even when KASLR is not enabled.

Fixes: 7a132017e7a5 ("KVM: arm64: Replace CONFIG_KVM_INDIRECT_VECTORS with CONFIG_RANDOMIZE_BASE")
Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/r/202009221053.Jv1XsQUZ%lkp@intel.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/kvm_asm.h |  2 --
 arch/arm64/include/asm/kvm_mmu.h | 51 +-------------------------------
 arch/arm64/kernel/proton-pack.c  |  2 --
 arch/arm64/kvm/arm.c             | 34 +++++++++++++++++++++
 arch/arm64/kvm/hyp/Makefile      |  3 +-
 arch/arm64/kvm/hyp/hyp-entry.S   |  2 --
 6 files changed, 36 insertions(+), 58 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index abe02cf66880..7f7072f6cb45 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -99,11 +99,9 @@ DECLARE_KVM_HYP_SYM(__kvm_hyp_vector);
 #define __kvm_hyp_init		CHOOSE_NVHE_SYM(__kvm_hyp_init)
 #define __kvm_hyp_vector	CHOOSE_HYP_SYM(__kvm_hyp_vector)
 
-#ifdef CONFIG_RANDOMIZE_BASE
 extern atomic_t arm64_el2_vector_last_slot;
 DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs);
 #define __bp_harden_hyp_vecs	CHOOSE_HYP_SYM(__bp_harden_hyp_vecs)
-#endif
 
 extern void __kvm_flush_vm_context(void);
 extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 36606ef9e435..cff1cebc7590 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -9,6 +9,7 @@
 
 #include <asm/page.h>
 #include <asm/memory.h>
+#include <asm/mmu.h>
 #include <asm/cpufeature.h>
 
 /*
@@ -430,7 +431,6 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
 	return ret;
 }
 
-#ifdef CONFIG_RANDOMIZE_BASE
 /*
  * EL2 vectors can be mapped and rerouted in a number of ways,
  * depending on the kernel configuration and CPU present:
@@ -451,12 +451,9 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
  * VHE, as we don't have hypervisor-specific mappings. If the system
  * is VHE and yet selects this capability, it will be ignored.
  */
-#include <asm/mmu.h>
-
 extern void *__kvm_bp_vect_base;
 extern int __kvm_harden_el2_vector_slot;
 
-/*  This is called on both VHE and !VHE systems */
 static inline void *kvm_get_hyp_vector(void)
 {
 	struct bp_hardening_data *data = arm64_get_bp_hardening_data();
@@ -480,52 +477,6 @@ static inline void *kvm_get_hyp_vector(void)
 	return vect;
 }
 
-/*  This is only called on a !VHE system */
-static inline int kvm_map_vectors(void)
-{
-	/*
-	 * SV2  = ARM64_SPECTRE_V2
-	 * HEL2 = ARM64_HARDEN_EL2_VECTORS
-	 *
-	 * !SV2 + !HEL2 -> use direct vectors
-	 *  SV2 + !HEL2 -> use hardened vectors in place
-	 * !SV2 +  HEL2 -> allocate one vector slot and use exec mapping
-	 *  SV2 +  HEL2 -> use hardened vertors and use exec mapping
-	 */
-	if (cpus_have_const_cap(ARM64_SPECTRE_V2)) {
-		__kvm_bp_vect_base = kvm_ksym_ref(__bp_harden_hyp_vecs);
-		__kvm_bp_vect_base = kern_hyp_va(__kvm_bp_vect_base);
-	}
-
-	if (cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) {
-		phys_addr_t vect_pa = __pa_symbol(__bp_harden_hyp_vecs);
-		unsigned long size = __BP_HARDEN_HYP_VECS_SZ;
-
-		/*
-		 * Always allocate a spare vector slot, as we don't
-		 * know yet which CPUs have a BP hardening slot that
-		 * we can reuse.
-		 */
-		__kvm_harden_el2_vector_slot = atomic_inc_return(&arm64_el2_vector_last_slot);
-		BUG_ON(__kvm_harden_el2_vector_slot >= BP_HARDEN_EL2_SLOTS);
-		return create_hyp_exec_mappings(vect_pa, size,
-						&__kvm_bp_vect_base);
-	}
-
-	return 0;
-}
-#else
-static inline void *kvm_get_hyp_vector(void)
-{
-	return kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
-}
-
-static inline int kvm_map_vectors(void)
-{
-	return 0;
-}
-#endif
-
 #define kvm_phys_to_vttbr(addr)		phys_to_ttbr(addr)
 
 /*
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
index b1ea935fd948..1fbaa0240d4c 100644
--- a/arch/arm64/kernel/proton-pack.c
+++ b/arch/arm64/kernel/proton-pack.c
@@ -177,7 +177,6 @@ enum mitigation_state arm64_get_spectre_v2_state(void)
 }
 
 #ifdef CONFIG_KVM
-#ifdef CONFIG_RANDOMIZE_BASE
 #include <asm/cacheflush.h>
 #include <asm/kvm_asm.h>
 
@@ -235,7 +234,6 @@ static void install_bp_hardening_cb(bp_hardening_cb_t fn)
 {
 	__this_cpu_write(bp_hardening_data.fn, fn);
 }
-#endif	/* CONFIG_RANDOMIZE_BASE */
 #endif	/* CONFIG_KVM */
 
 static void call_smc_arch_workaround_1(void)
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 9b90a701462c..13e559ac7235 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1256,6 +1256,40 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	}
 }
 
+static int kvm_map_vectors(void)
+{
+	/*
+	 * SV2  = ARM64_SPECTRE_V2
+	 * HEL2 = ARM64_HARDEN_EL2_VECTORS
+	 *
+	 * !SV2 + !HEL2 -> use direct vectors
+	 *  SV2 + !HEL2 -> use hardened vectors in place
+	 * !SV2 +  HEL2 -> allocate one vector slot and use exec mapping
+	 *  SV2 +  HEL2 -> use hardened vectors and use exec mapping
+	 */
+	if (cpus_have_const_cap(ARM64_SPECTRE_V2)) {
+		__kvm_bp_vect_base = kvm_ksym_ref(__bp_harden_hyp_vecs);
+		__kvm_bp_vect_base = kern_hyp_va(__kvm_bp_vect_base);
+	}
+
+	if (cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) {
+		phys_addr_t vect_pa = __pa_symbol(__bp_harden_hyp_vecs);
+		unsigned long size = __BP_HARDEN_HYP_VECS_SZ;
+
+		/*
+		 * Always allocate a spare vector slot, as we don't
+		 * know yet which CPUs have a BP hardening slot that
+		 * we can reuse.
+		 */
+		__kvm_harden_el2_vector_slot = atomic_inc_return(&arm64_el2_vector_last_slot);
+		BUG_ON(__kvm_harden_el2_vector_slot >= BP_HARDEN_EL2_SLOTS);
+		return create_hyp_exec_mappings(vect_pa, size,
+						&__kvm_bp_vect_base);
+	}
+
+	return 0;
+}
+
 static void cpu_init_hyp_mode(void)
 {
 	phys_addr_t pgd_ptr;
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index 89d2bf73acb5..d898f0da5802 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -10,5 +10,4 @@ subdir-ccflags-y := -I$(incdir)				\
 		    -DDISABLE_BRANCH_PROFILING		\
 		    $(DISABLE_STACKLEAK_PLUGIN)
 
-obj-$(CONFIG_KVM) += vhe/ nvhe/
-obj-$(CONFIG_RANDOMIZE_BASE) += smccc_wa.o
+obj-$(CONFIG_KVM) += vhe/ nvhe/ smccc_wa.o
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 3cca756e9e41..7ea277b82967 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -259,7 +259,6 @@ SYM_CODE_START(__kvm_hyp_vector)
 	valid_vect	el1_error		// Error 32-bit EL1
 SYM_CODE_END(__kvm_hyp_vector)
 
-#ifdef CONFIG_RANDOMIZE_BASE
 .macro hyp_ventry
 	.align 7
 1:	esb
@@ -309,4 +308,3 @@ SYM_CODE_START(__bp_harden_hyp_vecs)
 1:	.org __bp_harden_hyp_vecs + __BP_HARDEN_HYP_VECS_SZ
 	.org 1b
 SYM_CODE_END(__bp_harden_hyp_vecs)
-#endif

From 5c8b0cbd9d6bac5f40943b5a7d8eac8cb86cbe7f Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Mon, 28 Sep 2020 14:06:50 +0100
Subject: [PATCH 73/87] arm64: Pull in task_stack_page() to Spectre-v4
 mitigation code

The kbuild robot reports that we're relying on an implicit inclusion to
get a definition of task_stack_page() in the Spectre-v4 mitigation code,
which is not always in place for some configurations:

  | arch/arm64/kernel/proton-pack.c:329:2: error: implicit declaration of function 'task_stack_page' [-Werror,-Wimplicit-function-declaration]
  |         task_pt_regs(task)->pstate |= val;
  |         ^
  | arch/arm64/include/asm/processor.h:268:36: note: expanded from macro 'task_pt_regs'
  |         ((struct pt_regs *)(THREAD_SIZE + task_stack_page(p)) - 1)
  |                                           ^
  | arch/arm64/kernel/proton-pack.c:329:2: note: did you mean 'task_spread_page'?

Add the missing include to fix the build error.

Fixes: a44acf477220 ("arm64: Move SSBD prctl() handler alongside other spectre mitigation code")
Reported-by: Anthony Steinhauser <asteinhauser@google.com>
Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/r/202009260013.Ul7AD29w%lkp@intel.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/proton-pack.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
index 1fbaa0240d4c..16c9a095a746 100644
--- a/arch/arm64/kernel/proton-pack.c
+++ b/arch/arm64/kernel/proton-pack.c
@@ -22,6 +22,7 @@
 #include <linux/device.h>
 #include <linux/nospec.h>
 #include <linux/prctl.h>
+#include <linux/sched/task_stack.h>
 
 #include <asm/spectre.h>
 #include <asm/traps.h>

From 780c083a8f840ca9162c7a4090ff5e10d15152a2 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Mon, 28 Sep 2020 14:03:00 +0100
Subject: [PATCH 74/87] arm64: Add support for PR_SPEC_DISABLE_NOEXEC prctl()
 option

The PR_SPEC_DISABLE_NOEXEC option to the PR_SPEC_STORE_BYPASS prctl()
allows the SSB mitigation to be enabled only until the next execve(),
at which point the state will revert back to PR_SPEC_ENABLE and the
mitigation will be disabled.

Add support for PR_SPEC_DISABLE_NOEXEC on arm64.

Reported-by: Anthony Steinhauser <asteinhauser@google.com>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/process.c     |  6 ++++++
 arch/arm64/kernel/proton-pack.c | 38 +++++++++++++++++++++++++++++----
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 9dbd35b95253..085d8ca39e47 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -21,6 +21,7 @@
 #include <linux/lockdep.h>
 #include <linux/mman.h>
 #include <linux/mm.h>
+#include <linux/nospec.h>
 #include <linux/stddef.h>
 #include <linux/sysctl.h>
 #include <linux/unistd.h>
@@ -609,6 +610,11 @@ void arch_setup_new_exec(void)
 	current->mm->context.flags = is_compat_task() ? MMCF_AARCH32 : 0;
 
 	ptrauth_thread_init_user(current);
+
+	if (task_spec_ssb_noexec(current)) {
+		arch_prctl_spec_ctrl_set(current, PR_SPEC_STORE_BYPASS,
+					 PR_SPEC_ENABLE);
+	}
 }
 
 #ifdef CONFIG_ARM64_TAGGED_ADDR_ABI
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
index 16c9a095a746..68b710f1b43f 100644
--- a/arch/arm64/kernel/proton-pack.c
+++ b/arch/arm64/kernel/proton-pack.c
@@ -660,6 +660,20 @@ void spectre_v4_enable_task_mitigation(struct task_struct *tsk)
  * prctl() may be necessary even when PSTATE.SSBS can be toggled directly
  * from userspace.
  */
+static void ssbd_prctl_enable_mitigation(struct task_struct *task)
+{
+	task_clear_spec_ssb_noexec(task);
+	task_set_spec_ssb_disable(task);
+	set_tsk_thread_flag(task, TIF_SSBD);
+}
+
+static void ssbd_prctl_disable_mitigation(struct task_struct *task)
+{
+	task_clear_spec_ssb_noexec(task);
+	task_clear_spec_ssb_disable(task);
+	clear_tsk_thread_flag(task, TIF_SSBD);
+}
+
 static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl)
 {
 	switch (ctrl) {
@@ -679,8 +693,7 @@ static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl)
 		if (spectre_v4_mitigations_on())
 			return -EPERM;
 
-		task_clear_spec_ssb_disable(task);
-		clear_tsk_thread_flag(task, TIF_SSBD);
+		ssbd_prctl_disable_mitigation(task);
 		break;
 	case PR_SPEC_FORCE_DISABLE:
 		/* Force disable speculation: force enable mitigation */
@@ -699,8 +712,22 @@ static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl)
 		if (spectre_v4_mitigations_off())
 			return -EPERM;
 
-		task_set_spec_ssb_disable(task);
-		set_tsk_thread_flag(task, TIF_SSBD);
+		ssbd_prctl_enable_mitigation(task);
+		break;
+	case PR_SPEC_DISABLE_NOEXEC:
+		/* Disable speculation until execve(): enable mitigation */
+		/*
+		 * If the mitigation state is forced one way or the other, then
+		 * we must fail now before we try to toggle it on execve().
+		 */
+		if (task_spec_ssb_force_disable(task) ||
+		    spectre_v4_mitigations_off() ||
+		    spectre_v4_mitigations_on()) {
+			return -EPERM;
+		}
+
+		ssbd_prctl_enable_mitigation(task);
+		task_set_spec_ssb_noexec(task);
 		break;
 	default:
 		return -ERANGE;
@@ -745,6 +772,9 @@ static int ssbd_prctl_get(struct task_struct *task)
 	if (task_spec_ssb_force_disable(task))
 		return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
 
+	if (task_spec_ssb_noexec(task))
+		return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC;
+
 	if (task_spec_ssb_disable(task))
 		return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
 

From ab25464bdabd45f283cc1194e332040f89071106 Mon Sep 17 00:00:00 2001
From: David Brazdil <dbrazdil@google.com>
Date: Tue, 22 Sep 2020 21:49:01 +0100
Subject: [PATCH 75/87] kvm: arm64: Partially link nVHE hyp code, simplify
 HYPCOPY

Relying on objcopy to prefix the ELF section names of the nVHE hyp code
is brittle and prevents us from using wildcards to match specific
section names.

Improve the build rules by partially linking all '.nvhe.o' files and
prefixing their ELF section names using a linker script. Continue using
objcopy for prefixing ELF symbol names.

One immediate advantage of this approach is that all subsections
matching a pattern can be merged into a single prefixed section, eg.
.text and .text.* can be linked into a single '.hyp.text'. This removes
the need for -fno-reorder-functions on GCC and will be useful in the
future too: LTO builds use .text subsections, compilers routinely
generate .rodata subsections, etc.

Partially linking all hyp code into a single object file also makes it
easier to analyze.

Signed-off-by: David Brazdil <dbrazdil@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20200922204910.7265-2-dbrazdil@google.com
---
 arch/arm64/include/asm/hyp_image.h | 24 ++++++++++++
 arch/arm64/kvm/hyp/nvhe/.gitignore |  2 +
 arch/arm64/kvm/hyp/nvhe/Makefile   | 60 ++++++++++++++++--------------
 arch/arm64/kvm/hyp/nvhe/hyp.lds.S  | 13 +++++++
 4 files changed, 72 insertions(+), 27 deletions(-)
 create mode 100644 arch/arm64/include/asm/hyp_image.h
 create mode 100644 arch/arm64/kvm/hyp/nvhe/.gitignore
 create mode 100644 arch/arm64/kvm/hyp/nvhe/hyp.lds.S

diff --git a/arch/arm64/include/asm/hyp_image.h b/arch/arm64/include/asm/hyp_image.h
new file mode 100644
index 000000000000..2e38fcda02fc
--- /dev/null
+++ b/arch/arm64/include/asm/hyp_image.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020 Google LLC.
+ * Written by David Brazdil <dbrazdil@google.com>
+ */
+
+#ifndef __ARM64_HYP_IMAGE_H__
+#define __ARM64_HYP_IMAGE_H__
+
+#ifdef LINKER_SCRIPT
+
+/*
+ * KVM nVHE ELF section names are prefixed with .hyp, to separate them
+ * from the kernel proper.
+ */
+#define HYP_SECTION_NAME(NAME)	.hyp##NAME
+
+/* Defines an ELF hyp section from input section @NAME and its subsections. */
+#define HYP_SECTION(NAME) \
+	HYP_SECTION_NAME(NAME) : { *(NAME NAME##.*) }
+
+#endif /* LINKER_SCRIPT */
+
+#endif /* __ARM64_HYP_IMAGE_H__ */
diff --git a/arch/arm64/kvm/hyp/nvhe/.gitignore b/arch/arm64/kvm/hyp/nvhe/.gitignore
new file mode 100644
index 000000000000..695d73d0249e
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+hyp.lds
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index aef76487edc2..2b27b60182f9 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -10,40 +10,46 @@ obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o
 obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
 	 ../fpsimd.o ../hyp-entry.o
 
-obj-y := $(patsubst %.o,%.hyp.o,$(obj-y))
-extra-y := $(patsubst %.hyp.o,%.hyp.tmp.o,$(obj-y))
+##
+## Build rules for compiling nVHE hyp code
+## Output of this folder is `kvm_nvhe.o`, a partially linked object
+## file containing all nVHE hyp code and data.
+##
 
-$(obj)/%.hyp.tmp.o: $(src)/%.c FORCE
+hyp-obj := $(patsubst %.o,%.nvhe.o,$(obj-y))
+obj-y := kvm_nvhe.o
+extra-y := $(hyp-obj) kvm_nvhe.tmp.o hyp.lds
+
+# 1) Compile all source files to `.nvhe.o` object files. The file extension
+#    avoids file name clashes for files shared with VHE.
+$(obj)/%.nvhe.o: $(src)/%.c FORCE
 	$(call if_changed_rule,cc_o_c)
-$(obj)/%.hyp.tmp.o: $(src)/%.S FORCE
+$(obj)/%.nvhe.o: $(src)/%.S FORCE
 	$(call if_changed_rule,as_o_S)
-$(obj)/%.hyp.o: $(obj)/%.hyp.tmp.o FORCE
+
+# 2) Compile linker script.
+$(obj)/hyp.lds: $(src)/hyp.lds.S FORCE
+	$(call if_changed_dep,cpp_lds_S)
+
+# 3) Partially link all '.nvhe.o' files and apply the linker script.
+#    Prefixes names of ELF sections with '.hyp', eg. '.hyp.text'.
+#    Note: The following rule assumes that the 'ld' rule puts LDFLAGS before
+#          the list of dependencies to form '-T $(obj)/hyp.lds'. This is to
+#          keep the dependency on the target while avoiding an error from
+#          GNU ld if the linker script is passed to it twice.
+LDFLAGS_kvm_nvhe.tmp.o := -r -T
+$(obj)/kvm_nvhe.tmp.o: $(obj)/hyp.lds $(addprefix $(obj)/,$(hyp-obj)) FORCE
+	$(call if_changed,ld)
+
+# 4) Produce the final 'kvm_nvhe.o', ready to be linked into 'vmlinux'.
+#    Prefixes names of ELF symbols with '__kvm_nvhe_'.
+$(obj)/kvm_nvhe.o: $(obj)/kvm_nvhe.tmp.o FORCE
 	$(call if_changed,hypcopy)
 
-# Disable reordering functions by GCC (enabled at -O2).
-# This pass puts functions into '.text.*' sections to aid the linker
-# in optimizing ELF layout. See HYPCOPY comment below for more info.
-ccflags-y += $(call cc-option,-fno-reorder-functions)
-
 # The HYPCOPY command uses `objcopy` to prefix all ELF symbol names
-# and relevant ELF section names to avoid clashes with VHE code/data.
-#
-# Hyp code is assumed to be in the '.text' section of the input object
-# files (with the exception of specialized sections such as
-# '.hyp.idmap.text'). This assumption may be broken by a compiler that
-# divides code into sections like '.text.unlikely' so as to optimize
-# ELF layout. HYPCOPY checks that no such sections exist in the input
-# using `objdump`, otherwise they would be linked together with other
-# kernel code and not memory-mapped correctly at runtime.
+# to avoid clashes with VHE code/data.
 quiet_cmd_hypcopy = HYPCOPY $@
-      cmd_hypcopy =							\
-	if $(OBJDUMP) -h $< | grep -F '.text.'; then			\
-		echo "$@: function reordering not supported in nVHE hyp code" >&2; \
-		/bin/false;						\
-	fi;								\
-	$(OBJCOPY) --prefix-symbols=__kvm_nvhe_				\
-		   --rename-section=.text=.hyp.text			\
-		   $< $@
+      cmd_hypcopy = $(OBJCOPY) --prefix-symbols=__kvm_nvhe_ $< $@
 
 # Remove ftrace and Shadow Call Stack CFLAGS.
 # This is equivalent to the 'notrace' and '__noscs' annotations.
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
new file mode 100644
index 000000000000..3b13d1c7cd1a
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020 Google LLC.
+ * Written by David Brazdil <dbrazdil@google.com>
+ *
+ * Linker script used for partial linking of nVHE EL2 object files.
+ */
+
+#include <asm/hyp_image.h>
+
+SECTIONS {
+	HYP_SECTION(.text)
+}

From ce492a16ffb8814d9651c3fdafc363bfa1b01189 Mon Sep 17 00:00:00 2001
From: David Brazdil <dbrazdil@google.com>
Date: Tue, 22 Sep 2020 21:49:02 +0100
Subject: [PATCH 76/87] kvm: arm64: Move nVHE hyp namespace macros to
 hyp_image.h

Minor cleanup to move all macros related to prefixing nVHE hyp section
and symbol names into one place: hyp_image.h.

Signed-off-by: David Brazdil <dbrazdil@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20200922204910.7265-3-dbrazdil@google.com
---
 arch/arm64/include/asm/hyp_image.h | 12 ++++++++++++
 arch/arm64/include/asm/kvm_asm.h   |  8 +-------
 arch/arm64/kernel/image-vars.h     |  2 --
 arch/arm64/kernel/vmlinux.lds.S    |  1 +
 4 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/include/asm/hyp_image.h b/arch/arm64/include/asm/hyp_image.h
index 2e38fcda02fc..daa1a1da539e 100644
--- a/arch/arm64/include/asm/hyp_image.h
+++ b/arch/arm64/include/asm/hyp_image.h
@@ -7,6 +7,12 @@
 #ifndef __ARM64_HYP_IMAGE_H__
 #define __ARM64_HYP_IMAGE_H__
 
+/*
+ * KVM nVHE code has its own symbol namespace prefixed with __kvm_nvhe_,
+ * to separate it from the kernel proper.
+ */
+#define kvm_nvhe_sym(sym)	__kvm_nvhe_##sym
+
 #ifdef LINKER_SCRIPT
 
 /*
@@ -19,6 +25,12 @@
 #define HYP_SECTION(NAME) \
 	HYP_SECTION_NAME(NAME) : { *(NAME NAME##.*) }
 
+/*
+ * Defines a linker script alias of a kernel-proper symbol referenced by
+ * KVM nVHE hyp code.
+ */
+#define KVM_NVHE_ALIAS(sym)	kvm_nvhe_sym(sym) = sym;
+
 #endif /* LINKER_SCRIPT */
 
 #endif /* __ARM64_HYP_IMAGE_H__ */
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 6f98fbd0ac81..c085032e2e3e 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -7,6 +7,7 @@
 #ifndef __ARM_KVM_ASM_H__
 #define __ARM_KVM_ASM_H__
 
+#include <asm/hyp_image.h>
 #include <asm/virt.h>
 
 #define	VCPU_WORKAROUND_2_FLAG_SHIFT	0
@@ -42,13 +43,6 @@
 
 #include <linux/mm.h>
 
-/*
- * Translate name of a symbol defined in nVHE hyp to the name seen
- * by kernel proper. All nVHE symbols are prefixed by the build system
- * to avoid clashes with the VHE variants.
- */
-#define kvm_nvhe_sym(sym)	__kvm_nvhe_##sym
-
 #define DECLARE_KVM_VHE_SYM(sym)	extern char sym[]
 #define DECLARE_KVM_NVHE_SYM(sym)	extern char kvm_nvhe_sym(sym)[]
 
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 8982b68289b7..76da2ad1010c 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -61,8 +61,6 @@ __efistub__ctype		= _ctype;
  * memory mappings.
  */
 
-#define KVM_NVHE_ALIAS(sym) __kvm_nvhe_##sym = sym;
-
 /* Alternative callbacks for init-time patching of nVHE hyp code. */
 KVM_NVHE_ALIAS(arm64_enable_wa2_handling);
 KVM_NVHE_ALIAS(kvm_patch_vector_branch);
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 7cba7623fcec..fbb13f38d0c5 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -9,6 +9,7 @@
 
 #include <asm-generic/vmlinux.lds.h>
 #include <asm/cache.h>
+#include <asm/hyp_image.h>
 #include <asm/kernel-pgtable.h>
 #include <asm/memory.h>
 #include <asm/page.h>

From 3471ee06e33e413d7fa73c1aa3092e6e794b9e05 Mon Sep 17 00:00:00 2001
From: David Brazdil <dbrazdil@google.com>
Date: Tue, 22 Sep 2020 21:49:03 +0100
Subject: [PATCH 77/87] kvm: arm64: Only define __kvm_ex_table for CONFIG_KVM

Minor cleanup that only creates __kvm_ex_table ELF section and
related symbols if CONFIG_KVM is enabled. Also useful as more
hyp-specific sections will be added.

Signed-off-by: David Brazdil <dbrazdil@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20200922204910.7265-4-dbrazdil@google.com
---
 arch/arm64/kernel/vmlinux.lds.S | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index fbb13f38d0c5..d14166012e51 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -22,11 +22,15 @@ ENTRY(_text)
 jiffies = jiffies_64;
 
 
+#ifdef CONFIG_KVM
 #define HYPERVISOR_EXTABLE					\
 	. = ALIGN(SZ_8);					\
 	__start___kvm_ex_table = .;				\
 	*(__kvm_ex_table)					\
 	__stop___kvm_ex_table = .;
+#else /* CONFIG_KVM */
+#define HYPERVISOR_EXTABLE
+#endif
 
 #define HYPERVISOR_TEXT					\
 	/*						\

From 717cf94adb54095d14a6674baea73123188f2901 Mon Sep 17 00:00:00 2001
From: David Brazdil <dbrazdil@google.com>
Date: Tue, 22 Sep 2020 21:49:04 +0100
Subject: [PATCH 78/87] kvm: arm64: Remove __hyp_this_cpu_read

this_cpu_ptr is meant for use in kernel proper because it selects between
TPIDR_EL1/2 based on nVHE/VHE. __hyp_this_cpu_ptr was used in hyp to always
select TPIDR_EL2. Unify all users behind this_cpu_ptr and friends by
selecting _EL2 register under __KVM_NVHE_HYPERVISOR__. VHE continues
selecting the register using alternatives.

Under CONFIG_DEBUG_PREEMPT, the kernel helpers perform a preemption check
which is omitted by the hyp helpers. Preserve the behavior for nVHE by
overriding the corresponding macros under __KVM_NVHE_HYPERVISOR__. Extend
the checks into VHE hyp code.

Signed-off-by: David Brazdil <dbrazdil@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Acked-by: Andrew Scull <ascull@google.com>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20200922204910.7265-5-dbrazdil@google.com
---
 arch/arm64/include/asm/kvm_asm.h          | 20 ----------------
 arch/arm64/include/asm/percpu.h           | 28 +++++++++++++++++++++--
 arch/arm64/kvm/hyp/include/hyp/debug-sr.h |  4 ++--
 arch/arm64/kvm/hyp/include/hyp/switch.h   |  8 +++----
 arch/arm64/kvm/hyp/nvhe/switch.c          |  2 +-
 arch/arm64/kvm/hyp/vhe/switch.c           |  2 +-
 arch/arm64/kvm/hyp/vhe/sysreg-sr.c        |  4 ++--
 7 files changed, 36 insertions(+), 32 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index c085032e2e3e..c196eec25498 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -143,26 +143,6 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
 		addr;							\
 	})
 
-/*
- * Home-grown __this_cpu_{ptr,read} variants that always work at HYP,
- * provided that sym is really a *symbol* and not a pointer obtained from
- * a data structure. As for SHIFT_PERCPU_PTR(), the creative casting keeps
- * sparse quiet.
- */
-#define __hyp_this_cpu_ptr(sym)						\
-	({								\
-		void *__ptr;						\
-		__verify_pcpu_ptr(&sym);				\
-		__ptr = hyp_symbol_addr(sym);				\
-		__ptr += read_sysreg(tpidr_el2);			\
-		(typeof(sym) __kernel __force *)__ptr;			\
-	 })
-
-#define __hyp_this_cpu_read(sym)					\
-	({								\
-		*__hyp_this_cpu_ptr(sym);				\
-	 })
-
 #define __KVM_EXTABLE(from, to)						\
 	"	.pushsection	__kvm_ex_table, \"a\"\n"		\
 	"	.align		3\n"					\
diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h
index 0b6409b89e5e..1599e17379d8 100644
--- a/arch/arm64/include/asm/percpu.h
+++ b/arch/arm64/include/asm/percpu.h
@@ -19,7 +19,16 @@ static inline void set_my_cpu_offset(unsigned long off)
 			:: "r" (off) : "memory");
 }
 
-static inline unsigned long __my_cpu_offset(void)
+static inline unsigned long __hyp_my_cpu_offset(void)
+{
+	/*
+	 * Non-VHE hyp code runs with preemption disabled. No need to hazard
+	 * the register access against barrier() as in __kern_my_cpu_offset.
+	 */
+	return read_sysreg(tpidr_el2);
+}
+
+static inline unsigned long __kern_my_cpu_offset(void)
 {
 	unsigned long off;
 
@@ -35,7 +44,12 @@ static inline unsigned long __my_cpu_offset(void)
 
 	return off;
 }
-#define __my_cpu_offset __my_cpu_offset()
+
+#ifdef __KVM_NVHE_HYPERVISOR__
+#define __my_cpu_offset __hyp_my_cpu_offset()
+#else
+#define __my_cpu_offset __kern_my_cpu_offset()
+#endif
 
 #define PERCPU_RW_OPS(sz)						\
 static inline unsigned long __percpu_read_##sz(void *ptr)		\
@@ -227,4 +241,14 @@ PERCPU_RET_OP(add, add, ldadd)
 
 #include <asm-generic/percpu.h>
 
+/* Redefine macros for nVHE hyp under DEBUG_PREEMPT to avoid its dependencies. */
+#if defined(__KVM_NVHE_HYPERVISOR__) && defined(CONFIG_DEBUG_PREEMPT)
+#undef	this_cpu_ptr
+#define	this_cpu_ptr		raw_cpu_ptr
+#undef	__this_cpu_read
+#define	__this_cpu_read		raw_cpu_read
+#undef	__this_cpu_write
+#define	__this_cpu_write	raw_cpu_write
+#endif
+
 #endif /* __ASM_PERCPU_H */
diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
index 5e28ea6aa097..4ebe9f558f3a 100644
--- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
@@ -135,7 +135,7 @@ static inline void __debug_switch_to_guest_common(struct kvm_vcpu *vcpu)
 	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
 		return;
 
-	host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+	host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
 	guest_ctxt = &vcpu->arch.ctxt;
 	host_dbg = &vcpu->arch.host_debug_state.regs;
 	guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
@@ -154,7 +154,7 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu)
 	if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY))
 		return;
 
-	host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+	host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
 	guest_ctxt = &vcpu->arch.ctxt;
 	host_dbg = &vcpu->arch.host_debug_state.regs;
 	guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr);
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 5b6b8fa00f0a..f150407fa798 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -386,7 +386,7 @@ static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu)
 	    !esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu)))
 		return false;
 
-	ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+	ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
 	__ptrauth_save_key(ctxt, APIA);
 	__ptrauth_save_key(ctxt, APIB);
 	__ptrauth_save_key(ctxt, APDA);
@@ -495,7 +495,7 @@ static inline void __set_guest_arch_workaround_state(struct kvm_vcpu *vcpu)
 	 * guest wants it disabled, so be it...
 	 */
 	if (__needs_ssbd_off(vcpu) &&
-	    __hyp_this_cpu_read(arm64_ssbd_callback_required))
+	    __this_cpu_read(arm64_ssbd_callback_required))
 		arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, 0, NULL);
 #endif
 }
@@ -507,7 +507,7 @@ static inline void __set_host_arch_workaround_state(struct kvm_vcpu *vcpu)
 	 * If the guest has disabled the workaround, bring it back on.
 	 */
 	if (__needs_ssbd_off(vcpu) &&
-	    __hyp_this_cpu_read(arm64_ssbd_callback_required))
+	    __this_cpu_read(arm64_ssbd_callback_required))
 		arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, 1, NULL);
 #endif
 }
@@ -521,7 +521,7 @@ static inline void __kvm_unexpected_el2_exception(void)
 
 	entry = hyp_symbol_addr(__start___kvm_ex_table);
 	end = hyp_symbol_addr(__stop___kvm_ex_table);
-	host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+	host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
 
 	while (entry < end) {
 		addr = (unsigned long)&entry->insn + entry->insn;
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 0970442d2dbc..cc4f8e790fb3 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -175,7 +175,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 
 	vcpu = kern_hyp_va(vcpu);
 
-	host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+	host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
 	host_ctxt->__hyp_running_vcpu = vcpu;
 	guest_ctxt = &vcpu->arch.ctxt;
 
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index c1da4f86ccac..575e8054f116 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -108,7 +108,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
 	struct kvm_cpu_context *guest_ctxt;
 	u64 exit_code;
 
-	host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+	host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
 	host_ctxt->__hyp_running_vcpu = vcpu;
 	guest_ctxt = &vcpu->arch.ctxt;
 
diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
index 996471e4c138..2a0b8c88d74f 100644
--- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
@@ -66,7 +66,7 @@ void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu)
 	struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
 	struct kvm_cpu_context *host_ctxt;
 
-	host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+	host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
 	__sysreg_save_user_state(host_ctxt);
 
 	/*
@@ -100,7 +100,7 @@ void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu)
 	struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
 	struct kvm_cpu_context *host_ctxt;
 
-	host_ctxt = &__hyp_this_cpu_ptr(kvm_host_data)->host_ctxt;
+	host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt;
 	deactivate_traps_vhe_put();
 
 	__sysreg_save_el1_state(guest_ctxt);

From ea391027d35546d9155f1350123b5af8bddec706 Mon Sep 17 00:00:00 2001
From: David Brazdil <dbrazdil@google.com>
Date: Tue, 22 Sep 2020 21:49:05 +0100
Subject: [PATCH 79/87] kvm: arm64: Remove hyp_adr/ldr_this_cpu

The hyp_adr/ldr_this_cpu helpers were introduced for use in hyp code
because they always needed to use TPIDR_EL2 for base, while
adr/ldr_this_cpu from kernel proper would select between TPIDR_EL2 and
_EL1 based on VHE/nVHE.

Simplify this now that the hyp mode case can be handled using the
__KVM_VHE/NVHE_HYPERVISOR__ macros.

Signed-off-by: David Brazdil <dbrazdil@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Acked-by: Andrew Scull <ascull@google.com>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20200922204910.7265-6-dbrazdil@google.com
---
 arch/arm64/include/asm/assembler.h | 29 +++++++++++++++++++----------
 arch/arm64/include/asm/kvm_asm.h   | 14 +-------------
 arch/arm64/kvm/hyp/hyp-entry.S     |  2 +-
 3 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 54d181177656..ddbe6bf00e33 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -218,6 +218,23 @@ lr	.req	x30		// link register
 	str	\src, [\tmp, :lo12:\sym]
 	.endm
 
+	/*
+	 * @dst: destination register
+	 */
+#if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__)
+	.macro	this_cpu_offset, dst
+	mrs	\dst, tpidr_el2
+	.endm
+#else
+	.macro	this_cpu_offset, dst
+alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
+	mrs	\dst, tpidr_el1
+alternative_else
+	mrs	\dst, tpidr_el2
+alternative_endif
+	.endm
+#endif
+
 	/*
 	 * @dst: Result of per_cpu(sym, smp_processor_id()) (can be SP)
 	 * @sym: The name of the per-cpu variable
@@ -226,11 +243,7 @@ lr	.req	x30		// link register
 	.macro adr_this_cpu, dst, sym, tmp
 	adrp	\tmp, \sym
 	add	\dst, \tmp, #:lo12:\sym
-alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
-	mrs	\tmp, tpidr_el1
-alternative_else
-	mrs	\tmp, tpidr_el2
-alternative_endif
+	this_cpu_offset \tmp
 	add	\dst, \dst, \tmp
 	.endm
 
@@ -241,11 +254,7 @@ alternative_endif
 	 */
 	.macro ldr_this_cpu dst, sym, tmp
 	adr_l	\dst, \sym
-alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
-	mrs	\tmp, tpidr_el1
-alternative_else
-	mrs	\tmp, tpidr_el2
-alternative_endif
+	this_cpu_offset \tmp
 	ldr	\dst, [\dst, \tmp]
 	.endm
 
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index c196eec25498..cf9456663289 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -173,20 +173,8 @@ extern char __smccc_workaround_1_smc[__SMCCC_WORKAROUND_1_SMC_SZ];
 
 #else /* __ASSEMBLY__ */
 
-.macro hyp_adr_this_cpu reg, sym, tmp
-	adr_l	\reg, \sym
-	mrs	\tmp, tpidr_el2
-	add	\reg, \reg, \tmp
-.endm
-
-.macro hyp_ldr_this_cpu reg, sym, tmp
-	adr_l	\reg, \sym
-	mrs	\tmp, tpidr_el2
-	ldr	\reg,  [\reg, \tmp]
-.endm
-
 .macro get_host_ctxt reg, tmp
-	hyp_adr_this_cpu \reg, kvm_host_data, \tmp
+	adr_this_cpu \reg, kvm_host_data, \tmp
 	add	\reg, \reg, #HOST_DATA_CONTEXT
 .endm
 
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 46b4dab933d0..fba91c2ab410 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -132,7 +132,7 @@ alternative_cb_end
 	str	x0, [x2, #VCPU_WORKAROUND_FLAGS]
 
 	/* Check that we actually need to perform the call */
-	hyp_ldr_this_cpu x0, arm64_ssbd_callback_required, x2
+	ldr_this_cpu x0, arm64_ssbd_callback_required, x2
 	cbz	x0, wa2_end
 
 	mov	w0, #ARM_SMCCC_ARCH_WORKAROUND_2

From 572494995bc3d282336bfd8162741929402910b9 Mon Sep 17 00:00:00 2001
From: David Brazdil <dbrazdil@google.com>
Date: Tue, 22 Sep 2020 21:49:06 +0100
Subject: [PATCH 80/87] kvm: arm64: Add helpers for accessing nVHE hyp per-cpu
 vars

Defining a per-CPU variable in hyp/nvhe will result in its name being
prefixed with __kvm_nvhe_. Add helpers for declaring these variables
in kernel proper and accessing them with this_cpu_ptr and per_cpu_ptr.

Signed-off-by: David Brazdil <dbrazdil@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20200922204910.7265-7-dbrazdil@google.com
---
 arch/arm64/include/asm/kvm_asm.h | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index cf9456663289..911d91787fa0 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -54,9 +54,21 @@
 	DECLARE_KVM_VHE_SYM(sym);		\
 	DECLARE_KVM_NVHE_SYM(sym)
 
+#define DECLARE_KVM_VHE_PER_CPU(type, sym)	\
+	DECLARE_PER_CPU(type, sym)
+#define DECLARE_KVM_NVHE_PER_CPU(type, sym)	\
+	DECLARE_PER_CPU(type, kvm_nvhe_sym(sym))
+
+#define DECLARE_KVM_HYP_PER_CPU(type, sym)	\
+	DECLARE_KVM_VHE_PER_CPU(type, sym);	\
+	DECLARE_KVM_NVHE_PER_CPU(type, sym)
+
 #define CHOOSE_VHE_SYM(sym)	sym
 #define CHOOSE_NVHE_SYM(sym)	kvm_nvhe_sym(sym)
 
+#define this_cpu_ptr_nvhe_sym(sym)	this_cpu_ptr(&kvm_nvhe_sym(sym))
+#define per_cpu_ptr_nvhe_sym(sym, cpu)	per_cpu_ptr(&kvm_nvhe_sym(sym), cpu)
+
 #ifndef __KVM_NVHE_HYPERVISOR__
 /*
  * BIG FAT WARNINGS:
@@ -69,12 +81,21 @@
  * - Don't let the nVHE hypervisor have access to this, as it will
  *   pick the *wrong* symbol (yes, it runs at EL2...).
  */
-#define CHOOSE_HYP_SYM(sym)	(is_kernel_in_hyp_mode() ? CHOOSE_VHE_SYM(sym) \
+#define CHOOSE_HYP_SYM(sym)		(is_kernel_in_hyp_mode()	\
+					   ? CHOOSE_VHE_SYM(sym)	\
 					   : CHOOSE_NVHE_SYM(sym))
+#define this_cpu_ptr_hyp_sym(sym)	(is_kernel_in_hyp_mode()	\
+					   ? this_cpu_ptr(&sym)		\
+					   : this_cpu_ptr_nvhe_sym(sym))
+#define per_cpu_ptr_hyp_sym(sym, cpu)	(is_kernel_in_hyp_mode()	\
+					   ? per_cpu_ptr(&sym, cpu)	\
+					   : per_cpu_ptr_nvhe_sym(sym, cpu))
 #else
 /* The nVHE hypervisor shouldn't even try to access anything */
 extern void *__nvhe_undefined_symbol;
-#define CHOOSE_HYP_SYM(sym)	__nvhe_undefined_symbol
+#define CHOOSE_HYP_SYM(sym)		__nvhe_undefined_symbol
+#define this_cpu_ptr_hyp_sym(sym)	(&__nvhe_undefined_symbol)
+#define per_cpu_ptr_hyp_sym(sym, cpu)	(&__nvhe_undefined_symbol)
 #endif
 
 /* Translate a kernel address @ptr into its equivalent linear mapping */

From df4c8214a18d202fa0ec221a001f640e020f7e44 Mon Sep 17 00:00:00 2001
From: David Brazdil <dbrazdil@google.com>
Date: Tue, 22 Sep 2020 21:49:07 +0100
Subject: [PATCH 81/87] kvm: arm64: Duplicate arm64_ssbd_callback_required for
 nVHE hyp

Hyp keeps track of which cores require SSBD callback by accessing a
kernel-proper global variable. Create an nVHE symbol of the same name
and copy the value from kernel proper to nVHE as KVM is being enabled
on a core.

Done in preparation for separating percpu memory owned by kernel
proper and nVHE.

Signed-off-by: David Brazdil <dbrazdil@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20200922204910.7265-8-dbrazdil@google.com
---
 arch/arm64/include/asm/kvm_mmu.h | 14 +++++++++++++-
 arch/arm64/kernel/image-vars.h   |  1 -
 arch/arm64/kvm/arm.c             |  3 +++
 arch/arm64/kvm/hyp/nvhe/switch.c |  3 +++
 4 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 189839c3706a..e134c2ba2c5d 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -529,6 +529,7 @@ static inline int kvm_map_vectors(void)
 
 #ifdef CONFIG_ARM64_SSBD
 DECLARE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);
+DECLARE_KVM_NVHE_PER_CPU(u64, arm64_ssbd_callback_required);
 
 static inline int hyp_map_aux_data(void)
 {
@@ -537,18 +538,29 @@ static inline int hyp_map_aux_data(void)
 	for_each_possible_cpu(cpu) {
 		u64 *ptr;
 
-		ptr = per_cpu_ptr(&arm64_ssbd_callback_required, cpu);
+		ptr = per_cpu_ptr_nvhe_sym(arm64_ssbd_callback_required, cpu);
 		err = create_hyp_mappings(ptr, ptr + 1, PAGE_HYP);
 		if (err)
 			return err;
 	}
 	return 0;
 }
+
+static inline void hyp_init_aux_data(void)
+{
+	u64 *ptr;
+
+	/* Copy arm64_ssbd_callback_required value from kernel to hyp. */
+	ptr = this_cpu_ptr_nvhe_sym(arm64_ssbd_callback_required);
+	*ptr = __this_cpu_read(arm64_ssbd_callback_required);
+}
 #else
 static inline int hyp_map_aux_data(void)
 {
 	return 0;
 }
+
+static inline void hyp_init_aux_data(void) {}
 #endif
 
 #define kvm_phys_to_vttbr(addr)		phys_to_ttbr(addr)
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 76da2ad1010c..59d12a0b4622 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -67,7 +67,6 @@ KVM_NVHE_ALIAS(kvm_patch_vector_branch);
 KVM_NVHE_ALIAS(kvm_update_va_mask);
 
 /* Global kernel state accessed by nVHE hyp code. */
-KVM_NVHE_ALIAS(arm64_ssbd_callback_required);
 KVM_NVHE_ALIAS(kvm_host_data);
 KVM_NVHE_ALIAS(kvm_vgic_global_state);
 
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 46dc3d75cf13..cbfff390f55f 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1295,6 +1295,9 @@ static void cpu_init_hyp_mode(void)
 	    arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) {
 		kvm_call_hyp_nvhe(__kvm_enable_ssbs);
 	}
+
+	/* Copy information whether SSBD callback is required to hyp. */
+	hyp_init_aux_data();
 }
 
 static void cpu_hyp_reset(void)
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index cc4f8e790fb3..4662df6330d7 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -27,6 +27,9 @@
 #include <asm/processor.h>
 #include <asm/thread_info.h>
 
+/* Non-VHE copy of the kernel symbol. */
+DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);
+
 static void __activate_traps(struct kvm_vcpu *vcpu)
 {
 	u64 val;

From 2a1198c9b436402582f7beed57028044b819329c Mon Sep 17 00:00:00 2001
From: David Brazdil <dbrazdil@google.com>
Date: Tue, 22 Sep 2020 21:49:08 +0100
Subject: [PATCH 82/87] kvm: arm64: Create separate instances of kvm_host_data
 for VHE/nVHE

Host CPU context is stored in a global per-cpu variable `kvm_host_data`.
In preparation for introducing independent per-CPU region for nVHE hyp,
create two separate instances of `kvm_host_data`, one for VHE and one
for nVHE.

Signed-off-by: David Brazdil <dbrazdil@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20200922204910.7265-9-dbrazdil@google.com
---
 arch/arm64/include/asm/kvm_host.h | 2 +-
 arch/arm64/kernel/image-vars.h    | 1 -
 arch/arm64/kvm/arm.c              | 5 ++---
 arch/arm64/kvm/hyp/nvhe/switch.c  | 3 +++
 arch/arm64/kvm/hyp/vhe/switch.c   | 3 +++
 arch/arm64/kvm/pmu.c              | 8 ++++----
 6 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index e52c927aade5..964e05777fe3 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -565,7 +565,7 @@ void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);
 
 struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
 
-DECLARE_PER_CPU(kvm_host_data_t, kvm_host_data);
+DECLARE_KVM_HYP_PER_CPU(kvm_host_data_t, kvm_host_data);
 
 static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt)
 {
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 59d12a0b4622..80da861b8180 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -67,7 +67,6 @@ KVM_NVHE_ALIAS(kvm_patch_vector_branch);
 KVM_NVHE_ALIAS(kvm_update_va_mask);
 
 /* Global kernel state accessed by nVHE hyp code. */
-KVM_NVHE_ALIAS(kvm_host_data);
 KVM_NVHE_ALIAS(kvm_vgic_global_state);
 
 /* Kernel constant needed to compute idmap addresses. */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index cbfff390f55f..a9d6f095031d 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -46,7 +46,6 @@
 __asm__(".arch_extension	virt");
 #endif
 
-DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data);
 static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
 
 /* The VMID used in the VTTBR */
@@ -1308,7 +1307,7 @@ static void cpu_hyp_reset(void)
 
 static void cpu_hyp_reinit(void)
 {
-	kvm_init_host_cpu_context(&this_cpu_ptr(&kvm_host_data)->host_ctxt);
+	kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt);
 
 	cpu_hyp_reset();
 
@@ -1543,7 +1542,7 @@ static int init_hyp_mode(void)
 	for_each_possible_cpu(cpu) {
 		kvm_host_data_t *cpu_data;
 
-		cpu_data = per_cpu_ptr(&kvm_host_data, cpu);
+		cpu_data = per_cpu_ptr_hyp_sym(kvm_host_data, cpu);
 		err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP);
 
 		if (err) {
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 4662df6330d7..a7e9b03bd9d1 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -30,6 +30,9 @@
 /* Non-VHE copy of the kernel symbol. */
 DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);
 
+/* Non-VHE instance of kvm_host_data. */
+DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data);
+
 static void __activate_traps(struct kvm_vcpu *vcpu)
 {
 	u64 val;
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index 575e8054f116..0949fc97bf03 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -28,6 +28,9 @@
 
 const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n";
 
+/* VHE instance of kvm_host_data. */
+DEFINE_PER_CPU(kvm_host_data_t, kvm_host_data);
+
 static void __activate_traps(struct kvm_vcpu *vcpu)
 {
 	u64 val;
diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
index 3c224162b3dd..c869c851d2dd 100644
--- a/arch/arm64/kvm/pmu.c
+++ b/arch/arm64/kvm/pmu.c
@@ -31,7 +31,7 @@ static bool kvm_pmu_switch_needed(struct perf_event_attr *attr)
  */
 void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
 {
-	struct kvm_host_data *ctx = this_cpu_ptr(&kvm_host_data);
+	struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
 
 	if (!kvm_pmu_switch_needed(attr))
 		return;
@@ -47,7 +47,7 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
  */
 void kvm_clr_pmu_events(u32 clr)
 {
-	struct kvm_host_data *ctx = this_cpu_ptr(&kvm_host_data);
+	struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
 
 	ctx->pmu_events.events_host &= ~clr;
 	ctx->pmu_events.events_guest &= ~clr;
@@ -173,7 +173,7 @@ void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu)
 		return;
 
 	preempt_disable();
-	host = this_cpu_ptr(&kvm_host_data);
+	host = this_cpu_ptr_hyp_sym(kvm_host_data);
 	events_guest = host->pmu_events.events_guest;
 	events_host = host->pmu_events.events_host;
 
@@ -193,7 +193,7 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu)
 	if (!has_vhe())
 		return;
 
-	host = this_cpu_ptr(&kvm_host_data);
+	host = this_cpu_ptr_hyp_sym(kvm_host_data);
 	events_guest = host->pmu_events.events_guest;
 	events_host = host->pmu_events.events_host;
 

From 30c953911c4370bfb622ee1c2fcc7e78c84df800 Mon Sep 17 00:00:00 2001
From: David Brazdil <dbrazdil@google.com>
Date: Tue, 22 Sep 2020 21:49:09 +0100
Subject: [PATCH 83/87] kvm: arm64: Set up hyp percpu data for nVHE

Add hyp percpu section to linker script and rename the corresponding ELF
sections of hyp/nvhe object files. This moves all nVHE-specific percpu
variables to the new hyp percpu section.

Allocate sufficient amount of memory for all percpu hyp regions at global KVM
init time and create corresponding hyp mappings.

The base addresses of hyp percpu regions are kept in a dynamically allocated
array in the kernel.

Add NULL checks in PMU event-reset code as it may run before KVM memory is
initialized.

Signed-off-by: David Brazdil <dbrazdil@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20200922204910.7265-10-dbrazdil@google.com
---
 arch/arm64/include/asm/kvm_asm.h  | 19 +++++++++--
 arch/arm64/kernel/vmlinux.lds.S   |  8 +++++
 arch/arm64/kvm/arm.c              | 55 +++++++++++++++++++++++++++++--
 arch/arm64/kvm/hyp/nvhe/hyp.lds.S |  6 ++++
 arch/arm64/kvm/pmu.c              |  5 ++-
 5 files changed, 87 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 911d91787fa0..863f669d4dc8 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -66,8 +66,19 @@
 #define CHOOSE_VHE_SYM(sym)	sym
 #define CHOOSE_NVHE_SYM(sym)	kvm_nvhe_sym(sym)
 
-#define this_cpu_ptr_nvhe_sym(sym)	this_cpu_ptr(&kvm_nvhe_sym(sym))
-#define per_cpu_ptr_nvhe_sym(sym, cpu)	per_cpu_ptr(&kvm_nvhe_sym(sym), cpu)
+/*
+ * Compute pointer to a symbol defined in nVHE percpu region.
+ * Returns NULL if percpu memory has not been allocated yet.
+ */
+#define this_cpu_ptr_nvhe_sym(sym)	per_cpu_ptr_nvhe_sym(sym, smp_processor_id())
+#define per_cpu_ptr_nvhe_sym(sym, cpu)						\
+	({									\
+		unsigned long base, off;					\
+		base = kvm_arm_hyp_percpu_base[cpu];				\
+		off = (unsigned long)&CHOOSE_NVHE_SYM(sym) -			\
+		      (unsigned long)&CHOOSE_NVHE_SYM(__per_cpu_start);		\
+		base ? (typeof(CHOOSE_NVHE_SYM(sym))*)(base + off) : NULL;	\
+	})
 
 #ifndef __KVM_NVHE_HYPERVISOR__
 /*
@@ -117,6 +128,10 @@ DECLARE_KVM_HYP_SYM(__kvm_hyp_vector);
 #define __kvm_hyp_init		CHOOSE_NVHE_SYM(__kvm_hyp_init)
 #define __kvm_hyp_vector	CHOOSE_HYP_SYM(__kvm_hyp_vector)
 
+extern unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
+DECLARE_KVM_NVHE_SYM(__per_cpu_start);
+DECLARE_KVM_NVHE_SYM(__per_cpu_end);
+
 #ifdef CONFIG_KVM_INDIRECT_VECTORS
 extern atomic_t arm64_el2_vector_last_slot;
 DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs);
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index d14166012e51..d52e6b5dbfd3 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -28,8 +28,15 @@ jiffies = jiffies_64;
 	__start___kvm_ex_table = .;				\
 	*(__kvm_ex_table)					\
 	__stop___kvm_ex_table = .;
+
+#define HYPERVISOR_PERCPU_SECTION				\
+	. = ALIGN(PAGE_SIZE);					\
+	HYP_SECTION_NAME(.data..percpu) : {			\
+		*(HYP_SECTION_NAME(.data..percpu))		\
+	}
 #else /* CONFIG_KVM */
 #define HYPERVISOR_EXTABLE
+#define HYPERVISOR_PERCPU_SECTION
 #endif
 
 #define HYPERVISOR_TEXT					\
@@ -195,6 +202,7 @@ SECTIONS
 	}
 
 	PERCPU_SECTION(L1_CACHE_BYTES)
+	HYPERVISOR_PERCPU_SECTION
 
 	.rela.dyn : ALIGN(8) {
 		*(.rela .rela*)
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index a9d6f095031d..35bed6b6d0a6 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -47,6 +47,7 @@ __asm__(".arch_extension	virt");
 #endif
 
 static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
+unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
 
 /* The VMID used in the VTTBR */
 static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
@@ -1255,6 +1256,19 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	}
 }
 
+static unsigned long nvhe_percpu_size(void)
+{
+	return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) -
+		(unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start);
+}
+
+static unsigned long nvhe_percpu_order(void)
+{
+	unsigned long size = nvhe_percpu_size();
+
+	return size ? get_order(size) : 0;
+}
+
 static void cpu_init_hyp_mode(void)
 {
 	phys_addr_t pgd_ptr;
@@ -1270,8 +1284,8 @@ static void cpu_init_hyp_mode(void)
 	 * kernel's mapping to the linear mapping, and store it in tpidr_el2
 	 * so that we can use adr_l to access per-cpu variables in EL2.
 	 */
-	tpidr_el2 = ((unsigned long)this_cpu_ptr(&kvm_host_data) -
-		     (unsigned long)kvm_ksym_ref(&kvm_host_data));
+	tpidr_el2 = (unsigned long)this_cpu_ptr_nvhe_sym(__per_cpu_start) -
+		    (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));
 
 	pgd_ptr = kvm_mmu_get_httbr();
 	hyp_stack_ptr = __this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE;
@@ -1461,8 +1475,10 @@ static void teardown_hyp_mode(void)
 	int cpu;
 
 	free_hyp_pgds();
-	for_each_possible_cpu(cpu)
+	for_each_possible_cpu(cpu) {
 		free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
+		free_pages(kvm_arm_hyp_percpu_base[cpu], nvhe_percpu_order());
+	}
 }
 
 /**
@@ -1495,6 +1511,24 @@ static int init_hyp_mode(void)
 		per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
 	}
 
+	/*
+	 * Allocate and initialize pages for Hypervisor-mode percpu regions.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct page *page;
+		void *page_addr;
+
+		page = alloc_pages(GFP_KERNEL, nvhe_percpu_order());
+		if (!page) {
+			err = -ENOMEM;
+			goto out_err;
+		}
+
+		page_addr = page_address(page);
+		memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size());
+		kvm_arm_hyp_percpu_base[cpu] = (unsigned long)page_addr;
+	}
+
 	/*
 	 * Map the Hyp-code called directly from the host
 	 */
@@ -1539,6 +1573,21 @@ static int init_hyp_mode(void)
 		}
 	}
 
+	/*
+	 * Map Hyp percpu pages
+	 */
+	for_each_possible_cpu(cpu) {
+		char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu];
+		char *percpu_end = percpu_begin + nvhe_percpu_size();
+
+		err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP);
+
+		if (err) {
+			kvm_err("Cannot map hyp percpu region\n");
+			goto out_err;
+		}
+	}
+
 	for_each_possible_cpu(cpu) {
 		kvm_host_data_t *cpu_data;
 
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
index 3b13d1c7cd1a..bb2d986ff696 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp.lds.S
@@ -7,7 +7,13 @@
  */
 
 #include <asm/hyp_image.h>
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/cache.h>
+#include <asm/memory.h>
 
 SECTIONS {
 	HYP_SECTION(.text)
+	HYP_SECTION_NAME(.data..percpu) : {
+		PERCPU_INPUT(L1_CACHE_BYTES)
+	}
 }
diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c
index c869c851d2dd..faf32a44ba04 100644
--- a/arch/arm64/kvm/pmu.c
+++ b/arch/arm64/kvm/pmu.c
@@ -33,7 +33,7 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr)
 {
 	struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
 
-	if (!kvm_pmu_switch_needed(attr))
+	if (!ctx || !kvm_pmu_switch_needed(attr))
 		return;
 
 	if (!attr->exclude_host)
@@ -49,6 +49,9 @@ void kvm_clr_pmu_events(u32 clr)
 {
 	struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data);
 
+	if (!ctx)
+		return;
+
 	ctx->pmu_events.events_host &= ~clr;
 	ctx->pmu_events.events_guest &= ~clr;
 }

From a3bb9c3a00551726590137e3974495ce6cf6b758 Mon Sep 17 00:00:00 2001
From: David Brazdil <dbrazdil@google.com>
Date: Tue, 22 Sep 2020 21:49:10 +0100
Subject: [PATCH 84/87] kvm: arm64: Remove unnecessary hyp mappings

With all nVHE per-CPU variables being part of the hyp per-CPU region,
mapping them individual is not necessary any longer. They are mapped to hyp
as part of the overall per-CPU region.

Signed-off-by: David Brazdil <dbrazdil@google.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Acked-by: Andrew Scull <ascull@google.com>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20200922204910.7265-11-dbrazdil@google.com
---
 arch/arm64/include/asm/kvm_mmu.h | 20 --------------------
 arch/arm64/kvm/arm.c             | 16 ----------------
 2 files changed, 36 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index e134c2ba2c5d..8e7919801196 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -531,21 +531,6 @@ static inline int kvm_map_vectors(void)
 DECLARE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);
 DECLARE_KVM_NVHE_PER_CPU(u64, arm64_ssbd_callback_required);
 
-static inline int hyp_map_aux_data(void)
-{
-	int cpu, err;
-
-	for_each_possible_cpu(cpu) {
-		u64 *ptr;
-
-		ptr = per_cpu_ptr_nvhe_sym(arm64_ssbd_callback_required, cpu);
-		err = create_hyp_mappings(ptr, ptr + 1, PAGE_HYP);
-		if (err)
-			return err;
-	}
-	return 0;
-}
-
 static inline void hyp_init_aux_data(void)
 {
 	u64 *ptr;
@@ -555,11 +540,6 @@ static inline void hyp_init_aux_data(void)
 	*ptr = __this_cpu_read(arm64_ssbd_callback_required);
 }
 #else
-static inline int hyp_map_aux_data(void)
-{
-	return 0;
-}
-
 static inline void hyp_init_aux_data(void) {}
 #endif
 
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 35bed6b6d0a6..92c88deea357 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1588,22 +1588,6 @@ static int init_hyp_mode(void)
 		}
 	}
 
-	for_each_possible_cpu(cpu) {
-		kvm_host_data_t *cpu_data;
-
-		cpu_data = per_cpu_ptr_hyp_sym(kvm_host_data, cpu);
-		err = create_hyp_mappings(cpu_data, cpu_data + 1, PAGE_HYP);
-
-		if (err) {
-			kvm_err("Cannot map host CPU state: %d\n", err);
-			goto out_err;
-		}
-	}
-
-	err = hyp_map_aux_data();
-	if (err)
-		kvm_err("Cannot map host auxiliary data: %d\n", err);
-
 	return 0;
 
 out_err:

From 030bdf3698b7c3af190dd1fe714f0545f23441d0 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Fri, 2 Oct 2020 07:49:46 +0200
Subject: [PATCH 85/87] KVM: arm64: Fix some documentation build warnings

As warned with make htmldocs:

	.../Documentation/virt/kvm/devices/vcpu.rst:70: WARNING: Malformed table.
	Text in column margin in table line 2.

	=======  ======================================================
	-ENODEV: PMUv3 not supported or GIC not initialized
	-ENXIO:  PMUv3 not properly configured or in-kernel irqchip not
	         configured as required prior to calling this attribute
	-EBUSY:  PMUv3 already initialized
	-EINVAL: Invalid filter range
	=======  ======================================================

The ':' character for two lines are above the size of the column.
Besides that, other tables at the file doesn't use ':', so
just drop them.

While here, also fix this warning also introduced at the same patch:

	.../Documentation/virt/kvm/devices/vcpu.rst:88: WARNING: Block quote ends without a blank line; unexpected unindent.

By marking the C code as a literal block.

Fixes: 8be86a5eec04 ("KVM: arm64: Document PMU filtering API")
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Link: https://lore.kernel.org/r/b5385dd0213f1f070667925bf7a807bf5270ba78.1601616399.git.mchehab+huawei@kernel.org
---
 Documentation/virt/kvm/devices/vcpu.rst | 26 ++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst
index da7c2ef7dafc..2acec3b9ef65 100644
--- a/Documentation/virt/kvm/devices/vcpu.rst
+++ b/Documentation/virt/kvm/devices/vcpu.rst
@@ -67,25 +67,25 @@ irqchip.
 :Returns:
 
 	 =======  ======================================================
-	 -ENODEV: PMUv3 not supported or GIC not initialized
-	 -ENXIO:  PMUv3 not properly configured or in-kernel irqchip not
+	 -ENODEV  PMUv3 not supported or GIC not initialized
+	 -ENXIO   PMUv3 not properly configured or in-kernel irqchip not
 	 	  configured as required prior to calling this attribute
-	 -EBUSY:  PMUv3 already initialized
-	 -EINVAL: Invalid filter range
+	 -EBUSY   PMUv3 already initialized
+	 -EINVAL  Invalid filter range
 	 =======  ======================================================
 
-Request the installation of a PMU event filter described as follows:
+Request the installation of a PMU event filter described as follows::
 
-struct kvm_pmu_event_filter {
-	__u16	base_event;
-	__u16	nevents;
+    struct kvm_pmu_event_filter {
+	    __u16	base_event;
+	    __u16	nevents;
 
-#define KVM_PMU_EVENT_ALLOW	0
-#define KVM_PMU_EVENT_DENY	1
+    #define KVM_PMU_EVENT_ALLOW	0
+    #define KVM_PMU_EVENT_DENY	1
 
-	__u8	action;
-	__u8	pad[3];
-};
+	    __u8	action;
+	    __u8	pad[3];
+    };
 
 A filter range is defined as the range [@base_event, @base_event + @nevents),
 together with an @action (KVM_PMU_EVENT_ALLOW or KVM_PMU_EVENT_DENY). The

From b259d137e91d80bf92eac453ffab179eb7941ede Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Wed, 30 Sep 2020 14:18:01 +0100
Subject: [PATCH 86/87] KVM: arm64: Pass level hint to TLBI during stage-2
 permission fault

Alex pointed out that we don't pass a level hint to the TLBI instruction
when handling a stage-2 permission fault, even though the walker does
at some point have the level information in its hands.

Rework stage2_update_leaf_attrs() so that it can optionally return the
level of the updated pte to its caller, which can in turn be used to
provide the correct TLBI level hint.

Reported-by: Alexandru Elisei <alexandru.elisei@arm.com>
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Alexandru Elisei <alexandru.elisei@arm.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Cc: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/595cc73e-636e-8b3a-f93a-b4e9fb218db8@arm.com
Link: https://lore.kernel.org/r/20200930131801.16889-1-will@kernel.org
---
 arch/arm64/kvm/hyp/pgtable.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 603d6b415337..0cdf6e461cbd 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -694,6 +694,7 @@ struct stage2_attr_data {
 	kvm_pte_t	attr_set;
 	kvm_pte_t	attr_clr;
 	kvm_pte_t	pte;
+	u32		level;
 };
 
 static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
@@ -706,6 +707,7 @@ static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 	if (!kvm_pte_valid(pte))
 		return 0;
 
+	data->level = level;
 	data->pte = pte;
 	pte &= ~data->attr_clr;
 	pte |= data->attr_set;
@@ -723,7 +725,8 @@ static int stage2_attr_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
 
 static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
 				    u64 size, kvm_pte_t attr_set,
-				    kvm_pte_t attr_clr, kvm_pte_t *orig_pte)
+				    kvm_pte_t attr_clr, kvm_pte_t *orig_pte,
+				    u32 *level)
 {
 	int ret;
 	kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
@@ -743,20 +746,24 @@ static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
 
 	if (orig_pte)
 		*orig_pte = data.pte;
+
+	if (level)
+		*level = data.level;
 	return 0;
 }
 
 int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
 {
 	return stage2_update_leaf_attrs(pgt, addr, size, 0,
-					KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, NULL);
+					KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
+					NULL, NULL);
 }
 
 kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
 {
 	kvm_pte_t pte = 0;
 	stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
-				 &pte);
+				 &pte, NULL);
 	dsb(ishst);
 	return pte;
 }
@@ -765,7 +772,7 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
 {
 	kvm_pte_t pte = 0;
 	stage2_update_leaf_attrs(pgt, addr, 1, 0, KVM_PTE_LEAF_ATTR_LO_S2_AF,
-				 &pte);
+				 &pte, NULL);
 	/*
 	 * "But where's the TLBI?!", you scream.
 	 * "Over in the core code", I sigh.
@@ -778,7 +785,7 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr)
 bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr)
 {
 	kvm_pte_t pte = 0;
-	stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte);
+	stage2_update_leaf_attrs(pgt, addr, 1, 0, 0, &pte, NULL);
 	return pte & KVM_PTE_LEAF_ATTR_LO_S2_AF;
 }
 
@@ -786,6 +793,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
 				   enum kvm_pgtable_prot prot)
 {
 	int ret;
+	u32 level;
 	kvm_pte_t set = 0, clr = 0;
 
 	if (prot & KVM_PGTABLE_PROT_R)
@@ -797,8 +805,9 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
 	if (prot & KVM_PGTABLE_PROT_X)
 		clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
 
-	ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL);
-	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, 0);
+	ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level);
+	if (!ret)
+		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level);
 	return ret;
 }
 

From ffd1b63a5860968068e943eab33383a766d30f64 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Wed, 30 Sep 2020 11:24:42 +0100
Subject: [PATCH 87/87] KVM: arm64: Ensure user_mem_abort() return value is
 initialised

If a change in the MMU notifier sequence number forces user_mem_abort()
to return early when attempting to handle a stage-2 fault, we return
uninitialised stack to kvm_handle_guest_abort(), which could potentially
result in the injection of an external abort into the guest or a spurious
return to userspace. Neither or these are what we want to do.

Initialise 'ret' to 0 in user_mem_abort() so that bailing due to a
change in the MMU notrifier sequence number is treated as though the
fault was handled.

Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Alexandru Elisei <alexandru.elisei@arm.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Cc: Gavin Shan <gshan@redhat.com>
Cc: Alexandru Elisei <alexandru.elisei@arm.com>
Link: https://lore.kernel.org/r/20200930102442.16142-1-will@kernel.org
---
 arch/arm64/kvm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index c5c26a9cb85b..a816cb8e619b 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -742,7 +742,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
 			  unsigned long fault_status)
 {
-	int ret;
+	int ret = 0;
 	bool write_fault, writable, force_pte = false;
 	bool exec_fault;
 	bool device = false;