From 4be8ddb48b1b6c6067fb59c846b9c6e19d6efe14 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 18 Apr 2023 23:47:37 +0200 Subject: [PATCH 01/17] KVM: arm64: Slightly optimize flush_context() bitmap_zero() is faster than bitmap_clear(), so use it to save a few cycles. Signed-off-by: Christophe JAILLET Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/97bf2743f3a302b3066aced02218b9da60690dd3.1681854412.git.christophe.jaillet@wanadoo.fr --- arch/arm64/kvm/vmid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vmid.c b/arch/arm64/kvm/vmid.c index 08978d0672e7..bbf0677cfefa 100644 --- a/arch/arm64/kvm/vmid.c +++ b/arch/arm64/kvm/vmid.c @@ -47,7 +47,7 @@ static void flush_context(void) int cpu; u64 vmid; - bitmap_clear(vmid_map, 0, NUM_USER_VMIDS); + bitmap_zero(vmid_map, NUM_USER_VMIDS); for_each_possible_cpu(cpu) { vmid = atomic64_xchg_relaxed(&per_cpu(active_vmids, cpu), 0); From a00e9e4319c2a8a8b166da028292de83190e39a4 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 18 Apr 2023 23:47:38 +0200 Subject: [PATCH 02/17] KVM: arm64: Use the bitmap API to allocate bitmaps Use bitmap_zalloc()/bitmap_free() instead of hand-writing them. It is less verbose and it improves the semantic. Signed-off-by: Christophe JAILLET Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/3c5043731db4d3635383e9326bc7e98e25de3288.1681854412.git.christophe.jaillet@wanadoo.fr --- arch/arm64/kvm/vmid.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/vmid.c b/arch/arm64/kvm/vmid.c index bbf0677cfefa..7fe8ba1a2851 100644 --- a/arch/arm64/kvm/vmid.c +++ b/arch/arm64/kvm/vmid.c @@ -182,8 +182,7 @@ int __init kvm_arm_vmid_alloc_init(void) */ WARN_ON(NUM_USER_VMIDS - 1 <= num_possible_cpus()); atomic64_set(&vmid_generation, VMID_FIRST_VERSION); - vmid_map = kcalloc(BITS_TO_LONGS(NUM_USER_VMIDS), - sizeof(*vmid_map), GFP_KERNEL); + vmid_map = bitmap_zalloc(NUM_USER_VMIDS, GFP_KERNEL); if (!vmid_map) return -ENOMEM; @@ -192,5 +191,5 @@ int __init kvm_arm_vmid_alloc_init(void) void __init kvm_arm_vmid_alloc_free(void) { - kfree(vmid_map); + bitmap_free(vmid_map); } From 1f0f4a2ef7a5693b135ce174e71f116db4bd684d Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 21 Apr 2023 07:16:05 +0000 Subject: [PATCH 03/17] KVM: arm64: Infer the PA offset from IPA in stage-2 map walker Until now, the page table walker counted increments to the PA and IPA of a walk in two separate places. While the PA is incremented as soon as a leaf PTE is installed in stage2_map_walker_try_leaf(), the IPA is actually bumped in the generic table walker context. Critically, __kvm_pgtable_visit() rereads the PTE after the LEAF callback returns to work out if a table or leaf was installed, and only bumps the IPA for a leaf PTE. This arrangement worked fine when we handled faults behind the write lock, as the walker had exclusive access to the stage-2 page tables. However, commit 1577cb5823ce ("KVM: arm64: Handle stage-2 faults in parallel") started handling all stage-2 faults behind the read lock, opening up a race where a walker could increment the PA but not the IPA of a walk. Nothing good ensues, as the walker starts mapping with the incorrect IPA -> PA relationship. For example, assume that two vCPUs took a data abort on the same IPA. 
One observes that dirty logging is disabled, and the other observed that it is enabled: vCPU attempting PMD mapping vCPU attempting PTE mapping ====================================== ===================================== /* install PMD */ stage2_make_pte(ctx, leaf); data->phys += granule; /* replace PMD with a table */ stage2_try_break_pte(ctx, data->mmu); stage2_make_pte(ctx, table); /* table is observed */ ctx.old = READ_ONCE(*ptep); table = kvm_pte_table(ctx.old, level); /* * map walk continues w/o incrementing * IPA. */ __kvm_pgtable_walk(..., level + 1); Bring an end to the whole mess by using the IPA as the single source of truth for how far along a walk has gotten. Work out the correct PA to map by calculating the IPA offset from the beginning of the walk and add that to the starting physical address. Cc: stable@vger.kernel.org Fixes: 1577cb5823ce ("KVM: arm64: Handle stage-2 faults in parallel") Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230421071606.1603916-2-oliver.upton@linux.dev --- arch/arm64/include/asm/kvm_pgtable.h | 1 + arch/arm64/kvm/hyp/pgtable.c | 32 ++++++++++++++++++++++++---- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 4cd6762bda80..dc3c072e862f 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -209,6 +209,7 @@ struct kvm_pgtable_visit_ctx { kvm_pte_t old; void *arg; struct kvm_pgtable_mm_ops *mm_ops; + u64 start; u64 addr; u64 end; u32 level; diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 3d61bd3e591d..140f82300db5 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -58,6 +58,7 @@ struct kvm_pgtable_walk_data { struct kvm_pgtable_walker *walker; + u64 start; u64 addr; u64 end; }; @@ -201,6 +202,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, .old = READ_ONCE(*ptep), .arg = data->walker->arg, .mm_ops = mm_ops, + .start = data->start, .addr = data->addr, .end = data->end, .level = level, @@ -293,6 +295,7 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_pgtable_walker *walker) { struct kvm_pgtable_walk_data walk_data = { + .start = ALIGN_DOWN(addr, PAGE_SIZE), .addr = ALIGN_DOWN(addr, PAGE_SIZE), .end = PAGE_ALIGN(walk_data.addr + size), .walker = walker, @@ -794,20 +797,43 @@ static bool stage2_pte_executable(kvm_pte_t pte) return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); } +static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx, + const struct stage2_map_data *data) +{ + u64 phys = data->phys; + + /* + * Stage-2 walks to update ownership data are communicated to the map + * walker using an invalid PA. Avoid offsetting an already invalid PA, + * which could overflow and make the address valid again. + */ + if (!kvm_phys_is_valid(phys)) + return phys; + + /* + * Otherwise, work out the correct PA based on how far the walk has + * gotten. 
+ */ + return phys + (ctx->addr - ctx->start); +} + static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { + u64 phys = stage2_map_walker_phys_addr(ctx, data); + if (data->force_pte && (ctx->level < (KVM_PGTABLE_MAX_LEVELS - 1))) return false; - return kvm_block_mapping_supported(ctx, data->phys); + return kvm_block_mapping_supported(ctx, phys); } static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { kvm_pte_t new; - u64 granule = kvm_granule_size(ctx->level), phys = data->phys; + u64 phys = stage2_map_walker_phys_addr(ctx, data); + u64 granule = kvm_granule_size(ctx->level); struct kvm_pgtable *pgt = data->mmu->pgt; struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; @@ -841,8 +867,6 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, stage2_make_pte(ctx, new); - if (kvm_phys_is_valid(phys)) - data->phys += granule; return 0; } From 39bc95be3782ba88e55cd72e830f37e74395831b Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 21 Apr 2023 07:16:06 +0000 Subject: [PATCH 04/17] KVM: arm64: Infer PA offset from VA in hyp map walker Similar to the recently fixed stage-2 walker, the hyp map walker increments the PA and VA of a walk separately. Unlike stage-2, there is no bug here as the map walker has exclusive access to the stage-1 page tables. Nonetheless, in the interest of continuity throughout the page table code, tweak the hyp map walker to avoid incrementing the PA and instead use the VA as the authoritative source of how far along a table walk has gotten. Calculate the PA to use for a leaf PTE by adding the offset of the VA from the start of the walk to the starting PA. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230421071606.1603916-3-oliver.upton@linux.dev --- arch/arm64/kvm/hyp/pgtable.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 140f82300db5..356a3fd5220c 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -410,13 +410,12 @@ enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte) static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, struct hyp_map_data *data) { + u64 phys = data->phys + (ctx->addr - ctx->start); kvm_pte_t new; - u64 granule = kvm_granule_size(ctx->level), phys = data->phys; if (!kvm_block_mapping_supported(ctx, phys)) return false; - data->phys += granule; new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level); if (ctx->old == new) return true; From 1ea244158a4a20ac686f4d1d46285f3d38d4571f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 21 Apr 2023 10:18:34 +0100 Subject: [PATCH 05/17] KVM: arm64: Constify start/end/phys fields of the pgtable walker data As we are revamping the way the pgtable walker evaluates some of the data, make it clear that we rely on somew of the fields to be constant across the lifetime of a walk. For this, flag the start, end and phys fields of the walk data as 'const', which will generate an error if we were to accidentally update these fields again. 
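As an illustration of the mechanism (a minimal kernel-style sketch with placeholder names, not taken from the patch): a 'const' member can still be set through the structure's initializer, but any later assignment is rejected at build time, which is exactly the protection relied upon here.

	/* Illustrative stand-in for kvm_pgtable_walk_data */
	struct walk_data {
		const u64 start;	/* fixed for the lifetime of the walk */
		u64 addr;		/* advances as the walk progresses */
		const u64 end;
	};

	static void walk(u64 from, u64 size)
	{
		struct walk_data d = { .start = from, .addr = from, .end = from + size };

		d.addr += PAGE_SIZE;	/* fine: addr is not const */
		/* d.end = 0; */	/* would not compile: 'end' is read-only */
	}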
Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/pgtable.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 356a3fd5220c..5282cb9ca4cf 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -58,9 +58,9 @@ struct kvm_pgtable_walk_data { struct kvm_pgtable_walker *walker; - u64 start; + const u64 start; u64 addr; - u64 end; + const u64 end; }; static bool kvm_phys_is_valid(u64 phys) @@ -352,7 +352,7 @@ int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, } struct hyp_map_data { - u64 phys; + const u64 phys; kvm_pte_t attr; }; @@ -578,7 +578,7 @@ void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt) } struct stage2_map_data { - u64 phys; + const u64 phys; kvm_pte_t attr; u8 owner_id; From 3d1793562858f2bc42cc722fe00ec9b2ff0618e1 Mon Sep 17 00:00:00 2001 From: Jingyu Wang Date: Thu, 9 Mar 2023 15:59:19 +0800 Subject: [PATCH 06/17] KVM: arm64: Fix repeated words in comments Delete the redundant word 'to'. Signed-off-by: Jingyu Wang Reviewed-by: Mukesh Ojha Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230309075919.169518-1-jingyuwang_vip@163.com --- arch/arm64/kvm/inject_fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index 64c3aec0d937..0bd93a5f21ce 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -204,7 +204,7 @@ void kvm_inject_size_fault(struct kvm_vcpu *vcpu) * Size Fault at level 0, as if exceeding PARange. * * Non-LPAE guests will only get the external abort, as there - * is no way to to describe the ASF. + * is no way to describe the ASF. */ if (vcpu_el1_is_32bit(vcpu) && !(vcpu_read_sys_reg(vcpu, TCR_EL1) & TTBCR_EAE)) From 4c181e3d352e9280c84fb4b4c7a8940ce005374e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 7 Mar 2023 17:37:14 +0000 Subject: [PATCH 07/17] KVM: arm64: Document check for TIF_FOREIGN_FPSTATE In kvm_arch_vcpu_load_fp() we unconditionally set the current FP state to FP_STATE_HOST_OWNED, this will be overridden to FP_STATE_NONE if TIF_FOREIGN_FPSTATE is set but the check is deferred until kvm_arch_vcpu_ctxflush_fp() where we are no longer preemptable. Add a comment to this effect to help avoid people being concerned about the lack of a check and discover where the check is done. Signed-off-by: Mark Brown Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221214-kvm-arm64-sme-context-switch-v2-1-57ba0082e9ff@kernel.org --- arch/arm64/kvm/fpsimd.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 1279949599b5..3fd0ce6a3500 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -81,6 +81,11 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) fpsimd_kvm_prepare(); + /* + * We will check TIF_FOREIGN_FPSTATE just before entering the + * guest in kvm_arch_vcpu_ctxflush_fp() and override this to + * FP_STATE_FREE if the flag set. + */ vcpu->arch.fp_state = FP_STATE_HOST_OWNED; vcpu_clear_flag(vcpu, HOST_SVE_ENABLED); From d071cefdcca39fdbcdd4bf36868448820dbac34b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 7 Mar 2023 17:37:15 +0000 Subject: [PATCH 08/17] KVM: arm64: Restructure check for SVE support in FP trap handler We share the same handler for general floating point and SVE traps with a check to make sure we don't handle any SVE traps if the system doesn't have SVE support. 
Since we will be adding SME support and wishing to handle that along with other FP related traps rewrite the check to be more scalable and a bit clearer too, ensuring we don't misidentify SME traps as SVE ones. Signed-off-by: Mark Brown Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221214-kvm-arm64-sme-context-switch-v2-2-57ba0082e9ff@kernel.org --- arch/arm64/kvm/hyp/include/hyp/switch.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 07d37ff88a3f..d29d2ebf9126 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -176,9 +176,17 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) sve_guest = vcpu_has_sve(vcpu); esr_ec = kvm_vcpu_trap_get_class(vcpu); - /* Don't handle SVE traps for non-SVE vcpus here: */ - if (!sve_guest && esr_ec != ESR_ELx_EC_FP_ASIMD) + /* Only handle traps the vCPU can support here: */ + switch (esr_ec) { + case ESR_ELx_EC_FP_ASIMD: + break; + case ESR_ELx_EC_SVE: + if (!sve_guest) + return false; + break; + default: return false; + } /* Valid trap. Switch the context: */ From aaa2f14e6f3f34de8edfb13566110a0fe0d88785 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 7 Mar 2023 17:37:16 +0000 Subject: [PATCH 09/17] KVM: arm64: Clarify host SME state management Normally when running a guest we do not touch the floating point register state until first use of floating point by the guest, saving the current state and loading the guest state at that point. This has been found to offer a performance benefit in common cases. However currently if SME is active when switching to a guest then we exit streaming mode, disable ZA and invalidate the floating point register state prior to starting the guest. The exit from streaming mode is required for correct guest operation, if we leave streaming mode enabled then many non-SME operations can generate SME traps (eg, SVE operations will become streaming SVE operations). If EL1 leaves CPACR_EL1.SMEN disabled then the host is unable to intercept these traps. This will mean that a SME unaware guest will see SME exceptions which will confuse it. Disabling streaming mode also avoids creating spurious indications of usage of the SME hardware which could impact system performance, especially with shared SME implementations. Document the requirement to exit streaming mode clearly. There is no issue with guest operation caused by PSTATE.ZA so we can defer handling for that until first floating point usage, do so if the register state is not that of the current task and hence has already been saved. We could also do this for the case where the register state is that for the current task however this is very unlikely to happen and would require disproportionate effort so continue to save the state in that case. Saving this state on first use would require that we map and unmap storage for the host version of these registers for use by the hypervisor, taking care to deal with protected KVM and the fact that the host can free or reallocate the backing storage. Given that the strong recommendation is that applications should only keep PSTATE.ZA enabled when the state it enables is in active use it is difficult to see a case where a VMM would wish to do this, it would need to not only be using SME but also running the guest in the middle of SME usage. 
This can be revisited in the future if a use case does arises, in the interim such tasks will work but experience a performance overhead. This brings our handling of SME more into line with our handling of other floating point state and documents more clearly the constraints we have, especially around streaming mode. Signed-off-by: Mark Brown Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221214-kvm-arm64-sme-context-switch-v2-3-57ba0082e9ff@kernel.org --- arch/arm64/kvm/fpsimd.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 3fd0ce6a3500..4c9dcd8fc939 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -92,20 +92,23 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) vcpu_set_flag(vcpu, HOST_SVE_ENABLED); - /* - * We don't currently support SME guests but if we leave - * things in streaming mode then when the guest starts running - * FPSIMD or SVE code it may generate SME traps so as a - * special case if we are in streaming mode we force the host - * state to be saved now and exit streaming mode so that we - * don't have to handle any SME traps for valid guest - * operations. Do this for ZA as well for now for simplicity. - */ if (system_supports_sme()) { vcpu_clear_flag(vcpu, HOST_SME_ENABLED); if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN) vcpu_set_flag(vcpu, HOST_SME_ENABLED); + /* + * If PSTATE.SM is enabled then save any pending FP + * state and disable PSTATE.SM. If we leave PSTATE.SM + * enabled and the guest does not enable SME via + * CPACR_EL1.SMEN then operations that should be valid + * may generate SME traps from EL1 to EL1 which we + * can't intercept and which would confuse the guest. + * + * Do the same for PSTATE.ZA in the case where there + * is state in the registers which has not already + * been saved, this is very unlikely to happen. + */ if (read_sysreg_s(SYS_SVCR) & (SVCR_SM_MASK | SVCR_ZA_MASK)) { vcpu->arch.fp_state = FP_STATE_FREE; fpsimd_save_and_flush_cpu_state(); From e910baa9c1efdf7634519c135c6723b0fd499683 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 1 May 2023 19:21:41 +0100 Subject: [PATCH 10/17] KVM: arm64: vgic: Add Apple M2 PRO/MAX cpus to the list of broken SEIS implementations Unsurprisingly, the M2 PRO is also affected by the SEIS bug, so add it to the naughty list. And since M2 MAX is likely to be of the same ilk, flag it as well. Tested on a M2 PRO mini machine. 
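For context, a sketch of how such a midr_range table is typically consumed (illustrative helper name; the actual check in vgic-v3.c may be shaped differently):

	/* broken_seis[] is the {}-terminated table extended by this patch */
	static bool cpu_in_broken_seis_list(void)
	{
		return is_midr_in_range_list(read_cpuid_id(), broken_seis);
	}

MIDR_ALL_VERSIONS() matches every variant and revision of the given implementer/part number, so listing a part once is enough and no per-revision checks are needed.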
Signed-off-by: Marc Zyngier Reviewed-by: Zenghui Yu Link: https://lore.kernel.org/r/20230501182141.39770-1-maz@kernel.org --- arch/arm64/include/asm/cputype.h | 8 ++++++++ arch/arm64/kvm/vgic/vgic-v3.c | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 683ca3af4084..5f6f84837a49 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -126,6 +126,10 @@ #define APPLE_CPU_PART_M1_FIRESTORM_MAX 0x029 #define APPLE_CPU_PART_M2_BLIZZARD 0x032 #define APPLE_CPU_PART_M2_AVALANCHE 0x033 +#define APPLE_CPU_PART_M2_BLIZZARD_PRO 0x034 +#define APPLE_CPU_PART_M2_AVALANCHE_PRO 0x035 +#define APPLE_CPU_PART_M2_BLIZZARD_MAX 0x038 +#define APPLE_CPU_PART_M2_AVALANCHE_MAX 0x039 #define AMPERE_CPU_PART_AMPERE1 0xAC3 @@ -181,6 +185,10 @@ #define MIDR_APPLE_M1_FIRESTORM_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM_MAX) #define MIDR_APPLE_M2_BLIZZARD MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD) #define MIDR_APPLE_M2_AVALANCHE MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE) +#define MIDR_APPLE_M2_BLIZZARD_PRO MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD_PRO) +#define MIDR_APPLE_M2_AVALANCHE_PRO MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE_PRO) +#define MIDR_APPLE_M2_BLIZZARD_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD_MAX) +#define MIDR_APPLE_M2_AVALANCHE_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE_MAX) #define MIDR_AMPERE1 MIDR_CPU_MODEL(ARM_CPU_IMP_AMPERE, AMPERE_CPU_PART_AMPERE1) /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */ diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 469d816f356f..93a47a515c13 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -616,6 +616,10 @@ static const struct midr_range broken_seis[] = { MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX), {}, }; From 6735150b69978a9f73e3d1bab719e81a5dfafa83 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 May 2023 16:31:26 -0700 Subject: [PATCH 11/17] KVM: Use syscore_ops instead of reboot_notifier to hook restart/shutdown Use syscore_ops.shutdown to disable hardware virtualization during a reboot instead of using the dedicated reboot_notifier so that KVM disables virtualization _after_ system_state has been updated. This will allow fixing a race in KVM's handling of a forced reboot where KVM can end up enabling hardware virtualization between kernel_restart_prepare() and machine_restart(). Rename KVM's hook to match the syscore op to avoid any possible confusion from wiring up a "reboot" helper to a "shutdown" hook (neither "shutdown nor "reboot" is completely accurate as the hook handles both). Opportunistically rewrite kvm_shutdown()'s comment to make it less VMX specific, and to explain why kvm_rebooting exists. 
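For reference, the shape of a syscore shutdown hook outside KVM (a minimal sketch with illustrative names). Unlike a reboot notifier, which fires in kernel_restart_prepare() before system_state is updated, the .shutdown op runs later in the restart/shutdown sequence, once system_state reflects the pending transition and just before control is handed to firmware.

	#include <linux/syscore_ops.h>

	static void mydrv_shutdown(void)
	{
		/* Called on the rebooting CPU for both shutdown and reboot,
		 * with system_state already updated; quiesce hardware here. */
	}

	static struct syscore_ops mydrv_syscore_ops = {
		.shutdown = mydrv_shutdown,
	};

	static int __init mydrv_init(void)
	{
		register_syscore_ops(&mydrv_syscore_ops);
		return 0;
	}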
Cc: Marc Zyngier Cc: Oliver Upton Cc: James Morse Cc: Suzuki K Poulose Cc: Zenghui Yu Cc: kvmarm@lists.linux.dev Cc: Huacai Chen Cc: Aleksandar Markovic Cc: Anup Patel Cc: Atish Patra Cc: kvm-riscv@lists.infradead.org Signed-off-by: Sean Christopherson Acked-by: Marc Zyngier Message-Id: <20230512233127.804012-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index cb5c13eee193..2079d6065795 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5213,26 +5213,24 @@ static int hardware_enable_all(void) return r; } -static int kvm_reboot(struct notifier_block *notifier, unsigned long val, - void *v) +static void kvm_shutdown(void) { /* - * Some (well, at least mine) BIOSes hang on reboot if - * in vmx root mode. - * - * And Intel TXT required VMX off for all cpu when system shutdown. + * Disable hardware virtualization and set kvm_rebooting to indicate + * that KVM has asynchronously disabled hardware virtualization, i.e. + * that relevant errors and exceptions aren't entirely unexpected. + * Some flavors of hardware virtualization need to be disabled before + * transferring control to firmware (to perform shutdown/reboot), e.g. + * on x86, virtualization can block INIT interrupts, which are used by + * firmware to pull APs back under firmware control. Note, this path + * is used for both shutdown and reboot scenarios, i.e. neither name is + * 100% comprehensive. */ pr_info("kvm: exiting hardware virtualization\n"); kvm_rebooting = true; on_each_cpu(hardware_disable_nolock, NULL, 1); - return NOTIFY_OK; } -static struct notifier_block kvm_reboot_notifier = { - .notifier_call = kvm_reboot, - .priority = 0, -}; - static int kvm_suspend(void) { /* @@ -5263,6 +5261,7 @@ static void kvm_resume(void) static struct syscore_ops kvm_syscore_ops = { .suspend = kvm_suspend, .resume = kvm_resume, + .shutdown = kvm_shutdown, }; #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ static int hardware_enable_all(void) @@ -5967,7 +5966,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) if (r) return r; - register_reboot_notifier(&kvm_reboot_notifier); register_syscore_ops(&kvm_syscore_ops); #endif @@ -6039,7 +6037,6 @@ err_cpu_kick_mask: err_vcpu_cache: #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING unregister_syscore_ops(&kvm_syscore_ops); - unregister_reboot_notifier(&kvm_reboot_notifier); cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); #endif return r; @@ -6065,7 +6062,6 @@ void kvm_exit(void) kvm_async_pf_deinit(); #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING unregister_syscore_ops(&kvm_syscore_ops); - unregister_reboot_notifier(&kvm_reboot_notifier); cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE); #endif kvm_irqfd_exit(); From e0ceec221f62deb5d6c32c0327030028d3db5f27 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 May 2023 16:31:27 -0700 Subject: [PATCH 12/17] KVM: Don't enable hardware after a restart/shutdown is initiated Reject hardware enabling, i.e. VM creation, if a restart/shutdown has been initiated to avoid re-enabling hardware between kvm_reboot() and machine_{halt,power_off,restart}(). The restart case is especially problematic (for x86) as enabling VMX (or clearing GIF in KVM_RUN on SVM) blocks INIT, which results in the restart/reboot hanging as BIOS is unable to wake and rendezvous with APs. 
Note, this bug, and the original issue that motivated the addition of kvm_reboot(), is effectively limited to a forced reboot, e.g. `reboot -f`. In a "normal" reboot, userspace will gracefully teardown userspace before triggering the kernel reboot (modulo bugs, errors, etc), i.e. any process that might do ioctl(KVM_CREATE_VM) is long gone. Fixes: 8e1c18157d87 ("KVM: VMX: Disable VMX when system shutdown") Signed-off-by: Sean Christopherson Acked-by: Marc Zyngier Message-Id: <20230512233127.804012-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2079d6065795..540e2bbf00f7 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5184,7 +5184,20 @@ static void hardware_disable_all(void) static int hardware_enable_all(void) { atomic_t failed = ATOMIC_INIT(0); - int r = 0; + int r; + + /* + * Do not enable hardware virtualization if the system is going down. + * If userspace initiated a forced reboot, e.g. reboot -f, then it's + * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling + * after kvm_reboot() is called. Note, this relies on system_state + * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops + * hook instead of registering a dedicated reboot notifier (the latter + * runs before system_state is updated). + */ + if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || + system_state == SYSTEM_RESTART) + return -EBUSY; /* * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu() @@ -5197,6 +5210,8 @@ static int hardware_enable_all(void) cpus_read_lock(); mutex_lock(&kvm_lock); + r = 0; + kvm_usage_count++; if (kvm_usage_count == 1) { on_each_cpu(hardware_enable_nolock, &failed, 1); From 3367eeab975145e65136c2d091fe6e0c6cb29636 Mon Sep 17 00:00:00 2001 From: Jacob Xu Date: Thu, 24 Feb 2022 17:29:59 -0800 Subject: [PATCH 13/17] KVM: VMX: Fix header file dependency of asm/vmx.h Include a definition of WARN_ON_ONCE() before using it. Fixes: bb1fcc70d98f ("KVM: nVMX: Allow L1 to use 5-level page walks for nested EPT") Cc: Sean Christopherson Signed-off-by: Jacob Xu [reworded commit message; changed to ] Signed-off-by: Jim Mattson Reviewed-by: Sean Christopherson Message-Id: <20220225012959.1554168-1-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 498dc600bd5c..0d02c4aafa6f 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -13,7 +13,9 @@ #include +#include #include + #include #include From afb2acb2e3a32e4d56f7fbd819769b98ed1b7520 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Wed, 10 May 2023 16:04:09 +0200 Subject: [PATCH 14/17] KVM: Fix vcpu_array[0] races In kvm_vm_ioctl_create_vcpu(), add vcpu to vcpu_array iff it's safe to access vcpu via kvm_get_vcpu() and kvm_for_each_vcpu(), i.e. when there's no failure path requiring vcpu removal and destruction. Such order is important because vcpu_array accessors may end up referencing vcpu at vcpu_array[0] even before online_vcpus is set to 1. 
When online_vcpus=0, any call to kvm_get_vcpu() goes through array_index_nospec() and ends with an attempt to xa_load(vcpu_array, 0): int num_vcpus = atomic_read(&kvm->online_vcpus); i = array_index_nospec(i, num_vcpus); return xa_load(&kvm->vcpu_array, i); Similarly, when online_vcpus=0, a kvm_for_each_vcpu() does not iterate over an "empty" range, but actually [0, ULONG_MAX]: xa_for_each_range(&kvm->vcpu_array, idx, vcpup, 0, \ (atomic_read(&kvm->online_vcpus) - 1)) In both cases, such online_vcpus=0 edge case, even if leading to unnecessary calls to XArray API, should not be an issue; requesting unpopulated indexes/ranges is handled by xa_load() and xa_for_each_range(). However, this means that when the first vCPU is created and inserted in vcpu_array *and* before online_vcpus is incremented, code calling kvm_get_vcpu()/kvm_for_each_vcpu() already has access to that first vCPU. This should not pose a problem assuming that once a vcpu is stored in vcpu_array, it will remain there, but that's not the case: kvm_vm_ioctl_create_vcpu() first inserts to vcpu_array, then requests a file descriptor. If create_vcpu_fd() fails, newly inserted vcpu is removed from the vcpu_array, then destroyed: vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus); r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT); kvm_get_kvm(kvm); r = create_vcpu_fd(vcpu); if (r < 0) { xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx); kvm_put_kvm_no_destroy(kvm); goto unlock_vcpu_destroy; } atomic_inc(&kvm->online_vcpus); This results in a possible race condition when a reference to a vcpu is acquired (via kvm_get_vcpu() or kvm_for_each_vcpu()) moments before said vcpu is destroyed. Signed-off-by: Michal Luczaj Message-Id: <20230510140410.1093987-2-mhal@rbox.co> Cc: stable@vger.kernel.org Fixes: c5b077549136 ("KVM: Convert the kvm->vcpus array to a xarray", 2021-12-08) Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 540e2bbf00f7..479802a892d4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3962,18 +3962,19 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) } vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus); - r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT); - BUG_ON(r == -EBUSY); + r = xa_reserve(&kvm->vcpu_array, vcpu->vcpu_idx, GFP_KERNEL_ACCOUNT); if (r) goto unlock_vcpu_destroy; /* Now it's all set up, let userspace reach it */ kvm_get_kvm(kvm); r = create_vcpu_fd(vcpu); - if (r < 0) { - xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx); - kvm_put_kvm_no_destroy(kvm); - goto unlock_vcpu_destroy; + if (r < 0) + goto kvm_put_xa_release; + + if (KVM_BUG_ON(!!xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) { + r = -EINVAL; + goto kvm_put_xa_release; } /* @@ -3988,6 +3989,9 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) kvm_create_vcpu_debugfs(vcpu); return r; +kvm_put_xa_release: + kvm_put_kvm_no_destroy(kvm); + xa_release(&kvm->vcpu_array, vcpu->vcpu_idx); unlock_vcpu_destroy: mutex_unlock(&kvm->lock); kvm_dirty_ring_free(&vcpu->dirty_ring); From ad45413d22e6a224f8530b6fcc9ac01c8ced7fd6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 3 May 2023 09:08:36 -0700 Subject: [PATCH 15/17] KVM: VMX: Don't rely _only_ on CPUID to enforce XCR0 restrictions for ECREATE Explicitly check the vCPU's supported XCR0 when determining whether or not the XFRM for ECREATE is valid. 
Checking CPUID works because KVM updates guest CPUID.0x12.1 to restrict the leaf to a subset of the guest's allowed XCR0, but that is rather subtle and KVM should not modify guest CPUID except for modeling true runtime behavior (allowed XFRM is most definitely not "runtime" behavior). Reviewed-by: Kai Huang Tested-by: Kai Huang Signed-off-by: Sean Christopherson Message-Id: <20230503160838.3412617-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/sgx.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 0574030b071f..2261b684a7d4 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -170,12 +170,19 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, return 1; } - /* Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. */ + /* + * Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. Note + * that the allowed XFRM (XFeature Request Mask) isn't strictly bound + * by the supported XCR0. FP+SSE *must* be set in XFRM, even if XSAVE + * is unsupported, i.e. even if XCR0 itself is completely unsupported. + */ if ((u32)miscselect & ~sgx_12_0->ebx || (u32)attributes & ~sgx_12_1->eax || (u32)(attributes >> 32) & ~sgx_12_1->ebx || (u32)xfrm & ~sgx_12_1->ecx || - (u32)(xfrm >> 32) & ~sgx_12_1->edx) { + (u32)(xfrm >> 32) & ~sgx_12_1->edx || + xfrm & ~(vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE) || + (xfrm & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { kvm_inject_gp(vcpu, 0); return 1; } From 275a87244ec8320e5d319132caa787329b04bb7e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 3 May 2023 09:08:37 -0700 Subject: [PATCH 16/17] KVM: x86: Don't adjust guest's CPUID.0x12.1 (allowed SGX enclave XFRM) Drop KVM's manipulation of guest's CPUID.0x12.1 ECX and EDX, i.e. the allowed XFRM of SGX enclaves, now that KVM explicitly checks the guest's allowed XCR0 when emulating ECREATE. Note, this could theoretically break a setup where userspace advertises a "bad" XFRM and relies on KVM to provide a sane CPUID model, but QEMU is the only known user of KVM SGX, and QEMU explicitly sets the SGX CPUID XFRM subleaf based on the guest's XCR0. Reviewed-by: Kai Huang Tested-by: Kai Huang Signed-off-by: Sean Christopherson Message-Id: <20230503160838.3412617-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 123bf8b97a4b..0c9660a07b23 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -253,7 +253,6 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e int nent) { struct kvm_cpuid_entry2 *best; - u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent); best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT); if (best) { @@ -292,21 +291,6 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT); } - - /* - * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate - * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's - * requested XCR0 value. The enclave's XFRM must be a subset of XCRO - * at the time of EENTER, thus adjust the allowed XFRM by the guest's - * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to - * '1' even on CPUs that don't support XSAVE. 
- */ - best = cpuid_entry2_find(entries, nent, 0x12, 0x1); - if (best) { - best->ecx &= guest_supported_xcr0 & 0xffffffff; - best->edx &= guest_supported_xcr0 >> 32; - best->ecx |= XFEATURE_MASK_FPSSE; - } } void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) From b9846a698c9aff4eb2214a06ac83638ad098f33f Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Tue, 9 May 2023 03:23:48 +0000 Subject: [PATCH 17/17] KVM: VMX: add MSR_IA32_TSX_CTRL into msrs_to_save Add MSR_IA32_TSX_CTRL into msrs_to_save[] to explicitly tell userspace to save/restore the register value during migration. Missing this may cause userspace that relies on KVM ioctl(KVM_GET_MSR_INDEX_LIST) fail to port the value to the target VM. In addition, there is no need to add MSR_IA32_TSX_CTRL when ARCH_CAP_TSX_CTRL_MSR is not supported in kvm_get_arch_capabilities(). So add the checking in kvm_probe_msr_to_save(). Fixes: c11f83e0626b ("KVM: vmx: implement MSR_IA32_TSX_CTRL disable RTM functionality") Reported-by: Jim Mattson Signed-off-by: Mingwei Zhang Reviewed-by: Xiaoyao Li Reviewed-by: Jim Mattson Message-Id: <20230509032348.1153070-1-mizhang@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ceb7c5e9cf9e..c0778ca39650 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1446,7 +1446,7 @@ static const u32 msrs_to_save_base[] = { #endif MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, - MSR_IA32_SPEC_CTRL, + MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL, MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, @@ -7155,6 +7155,10 @@ static void kvm_probe_msr_to_save(u32 msr_index) if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) return; break; + case MSR_IA32_TSX_CTRL: + if (!(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR)) + return; + break; default: break; }
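As an illustration of the userspace side mentioned above (a hedged sketch, not part of this series), a VMM discovers the list of MSRs to migrate, which now includes MSR_IA32_TSX_CTRL where ARCH_CAP_TSX_CTRL_MSR is supported, via KVM_GET_MSR_INDEX_LIST on the /dev/kvm fd:

	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	int main(void)
	{
		struct kvm_msr_list probe = { .nmsrs = 0 };
		struct kvm_msr_list *list;
		int kvm = open("/dev/kvm", O_RDWR);

		/* With nmsrs == 0 the ioctl fails with E2BIG but still reports
		 * how many indices KVM wants userspace to save/restore. */
		ioctl(kvm, KVM_GET_MSR_INDEX_LIST, &probe);

		list = calloc(1, sizeof(*list) + probe.nmsrs * sizeof(list->indices[0]));
		list->nmsrs = probe.nmsrs;
		if (!ioctl(kvm, KVM_GET_MSR_INDEX_LIST, list))
			for (unsigned int i = 0; i < list->nmsrs; i++)
				printf("0x%x\n", list->indices[i]);

		free(list);
		return 0;
	}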