linux/arch/arm64/kvm/pkvm.c
Will Deacon 87727ba2bb KVM: arm64: Ensure CPU PMU probes before pKVM host de-privilege
Although pKVM supports CPU PMU emulation for non-protected guests since
722625c6f4 ("KVM: arm64: Reenable pmu in Protected Mode"), this relies
on the PMU driver probing before the host has de-privileged so that the
'kvm_arm_pmu_available' static key can still be enabled by patching the
hypervisor text.

As it happens, both of these events hang off device_initcall() but the
PMU consistently won the race until 7755cec63a ("arm64: perf: Move
PMUv3 driver to drivers/perf"). Since then, the host will fail to boot
when pKVM is enabled:

  | hw perfevents: enabled with armv8_pmuv3_0 PMU driver, 7 counters available
  | kvm [1]: nVHE hyp BUG at: [<ffff8000090366e0>] __kvm_nvhe_handle_host_mem_abort+0x270/0x284!
  | kvm [1]: Cannot dump pKVM nVHE stacktrace: !CONFIG_PROTECTED_NVHE_STACKTRACE
  | kvm [1]: Hyp Offset: 0xfffea41fbdf70000
  | Kernel panic - not syncing: HYP panic:
  | PS:a00003c9 PC:0000dbe04b0c66e0 ESR:00000000f2000800
  | FAR:fffffbfffddfcf00 HPFAR:00000000010b0bf0 PAR:0000000000000000
  | VCPU:0000000000000000
  | CPU: 2 PID: 1 Comm: swapper/0 Not tainted 6.3.0-rc7-00083-g0bce6746d154 #1
  | Hardware name: QEMU QEMU Virtual Machine, BIOS 0.0.0 02/06/2015
  | Call trace:
  |  dump_backtrace+0xec/0x108
  |  show_stack+0x18/0x2c
  |  dump_stack_lvl+0x50/0x68
  |  dump_stack+0x18/0x24
  |  panic+0x13c/0x33c
  |  nvhe_hyp_panic_handler+0x10c/0x190
  |  aarch64_insn_patch_text_nosync+0x64/0xc8
  |  arch_jump_label_transform+0x4c/0x5c
  |  __jump_label_update+0x84/0xfc
  |  jump_label_update+0x100/0x134
  |  static_key_enable_cpuslocked+0x68/0xac
  |  static_key_enable+0x20/0x34
  |  kvm_host_pmu_init+0x88/0xa4
  |  armpmu_register+0xf0/0xf4
  |  arm_pmu_acpi_probe+0x2ec/0x368
  |  armv8_pmu_driver_init+0x38/0x44
  |  do_one_initcall+0xcc/0x240

Fix the race properly by deferring the de-privilege step to
device_initcall_sync(). This will also be needed in future when probing
IOMMU devices and allows us to separate the pKVM de-privilege logic from
the core hypervisor initialisation path.

Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Fuad Tabba <tabba@google.com>
Cc: Marc Zyngier <maz@kernel.org>
Fixes: 7755cec63a ("arm64: perf: Move PMUv3 driver to drivers/perf")
Tested-by: Fuad Tabba <tabba@google.com>
Acked-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230420123356.2708-1-will@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
2023-04-20 16:57:53 +01:00

263 lines
5.9 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2020 - Google LLC
* Author: Quentin Perret <qperret@google.com>
*/
#include <linux/init.h>
#include <linux/kmemleak.h>
#include <linux/kvm_host.h>
#include <linux/memblock.h>
#include <linux/mutex.h>
#include <linux/sort.h>
#include <asm/kvm_pkvm.h>
#include "hyp_constants.h"
DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);
static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory);
static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr);
phys_addr_t hyp_mem_base;
phys_addr_t hyp_mem_size;
static int cmp_hyp_memblock(const void *p1, const void *p2)
{
const struct memblock_region *r1 = p1;
const struct memblock_region *r2 = p2;
return r1->base < r2->base ? -1 : (r1->base > r2->base);
}
static void __init sort_memblock_regions(void)
{
sort(hyp_memory,
*hyp_memblock_nr_ptr,
sizeof(struct memblock_region),
cmp_hyp_memblock,
NULL);
}
static int __init register_memblock_regions(void)
{
struct memblock_region *reg;
for_each_mem_region(reg) {
if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS)
return -ENOMEM;
hyp_memory[*hyp_memblock_nr_ptr] = *reg;
(*hyp_memblock_nr_ptr)++;
}
sort_memblock_regions();
return 0;
}
void __init kvm_hyp_reserve(void)
{
u64 hyp_mem_pages = 0;
int ret;
if (!is_hyp_mode_available() || is_kernel_in_hyp_mode())
return;
if (kvm_get_mode() != KVM_MODE_PROTECTED)
return;
ret = register_memblock_regions();
if (ret) {
*hyp_memblock_nr_ptr = 0;
kvm_err("Failed to register hyp memblocks: %d\n", ret);
return;
}
hyp_mem_pages += hyp_s1_pgtable_pages();
hyp_mem_pages += host_s2_pgtable_pages();
hyp_mem_pages += hyp_vm_table_pages();
hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE);
/*
* Try to allocate a PMD-aligned region to reduce TLB pressure once
* this is unmapped from the host stage-2, and fallback to PAGE_SIZE.
*/
hyp_mem_size = hyp_mem_pages << PAGE_SHIFT;
hyp_mem_base = memblock_phys_alloc(ALIGN(hyp_mem_size, PMD_SIZE),
PMD_SIZE);
if (!hyp_mem_base)
hyp_mem_base = memblock_phys_alloc(hyp_mem_size, PAGE_SIZE);
else
hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE);
if (!hyp_mem_base) {
kvm_err("Failed to reserve hyp memory\n");
return;
}
kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20,
hyp_mem_base);
}
/*
* Allocates and donates memory for hypervisor VM structs at EL2.
*
* Allocates space for the VM state, which includes the hyp vm as well as
* the hyp vcpus.
*
* Stores an opaque handler in the kvm struct for future reference.
*
* Return 0 on success, negative error code on failure.
*/
static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
{
size_t pgd_sz, hyp_vm_sz, hyp_vcpu_sz;
struct kvm_vcpu *host_vcpu;
pkvm_handle_t handle;
void *pgd, *hyp_vm;
unsigned long idx;
int ret;
if (host_kvm->created_vcpus < 1)
return -EINVAL;
pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.vtcr);
/*
* The PGD pages will be reclaimed using a hyp_memcache which implies
* page granularity. So, use alloc_pages_exact() to get individual
* refcounts.
*/
pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT);
if (!pgd)
return -ENOMEM;
/* Allocate memory to donate to hyp for vm and vcpu pointers. */
hyp_vm_sz = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE,
size_mul(sizeof(void *),
host_kvm->created_vcpus)));
hyp_vm = alloc_pages_exact(hyp_vm_sz, GFP_KERNEL_ACCOUNT);
if (!hyp_vm) {
ret = -ENOMEM;
goto free_pgd;
}
/* Donate the VM memory to hyp and let hyp initialize it. */
ret = kvm_call_hyp_nvhe(__pkvm_init_vm, host_kvm, hyp_vm, pgd);
if (ret < 0)
goto free_vm;
handle = ret;
host_kvm->arch.pkvm.handle = handle;
/* Donate memory for the vcpus at hyp and initialize it. */
hyp_vcpu_sz = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE);
kvm_for_each_vcpu(idx, host_vcpu, host_kvm) {
void *hyp_vcpu;
/* Indexing of the vcpus to be sequential starting at 0. */
if (WARN_ON(host_vcpu->vcpu_idx != idx)) {
ret = -EINVAL;
goto destroy_vm;
}
hyp_vcpu = alloc_pages_exact(hyp_vcpu_sz, GFP_KERNEL_ACCOUNT);
if (!hyp_vcpu) {
ret = -ENOMEM;
goto destroy_vm;
}
ret = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu,
hyp_vcpu);
if (ret) {
free_pages_exact(hyp_vcpu, hyp_vcpu_sz);
goto destroy_vm;
}
}
return 0;
destroy_vm:
pkvm_destroy_hyp_vm(host_kvm);
return ret;
free_vm:
free_pages_exact(hyp_vm, hyp_vm_sz);
free_pgd:
free_pages_exact(pgd, pgd_sz);
return ret;
}
int pkvm_create_hyp_vm(struct kvm *host_kvm)
{
int ret = 0;
mutex_lock(&host_kvm->lock);
if (!host_kvm->arch.pkvm.handle)
ret = __pkvm_create_hyp_vm(host_kvm);
mutex_unlock(&host_kvm->lock);
return ret;
}
void pkvm_destroy_hyp_vm(struct kvm *host_kvm)
{
if (host_kvm->arch.pkvm.handle) {
WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm,
host_kvm->arch.pkvm.handle));
}
host_kvm->arch.pkvm.handle = 0;
free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc);
}
int pkvm_init_host_vm(struct kvm *host_kvm)
{
mutex_init(&host_kvm->lock);
return 0;
}
static void __init _kvm_host_prot_finalize(void *arg)
{
int *err = arg;
if (WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize)))
WRITE_ONCE(*err, -EINVAL);
}
static int __init pkvm_drop_host_privileges(void)
{
int ret = 0;
/*
* Flip the static key upfront as that may no longer be possible
* once the host stage 2 is installed.
*/
static_branch_enable(&kvm_protected_mode_initialized);
on_each_cpu(_kvm_host_prot_finalize, &ret, 1);
return ret;
}
static int __init finalize_pkvm(void)
{
int ret;
if (!is_protected_kvm_enabled())
return 0;
/*
* Exclude HYP sections from kmemleak so that they don't get peeked
* at, which would end badly once inaccessible.
*/
kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start);
kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size);
ret = pkvm_drop_host_privileges();
if (ret)
pr_err("Failed to finalize Hyp protection: %d\n", ret);
return ret;
}
device_initcall_sync(finalize_pkvm);