diff --git a/Documentation/arch/x86/mds.rst b/Documentation/arch/x86/mds.rst index e73fdff62c0a..c58c72362911 100644 --- a/Documentation/arch/x86/mds.rst +++ b/Documentation/arch/x86/mds.rst @@ -95,6 +95,9 @@ The kernel provides a function to invoke the buffer clearing: mds_clear_cpu_buffers() +Also macro CLEAR_CPU_BUFFERS can be used in ASM late in exit-to-user path. +Other than CFLAGS.ZF, this macro doesn't clobber any registers. + The mitigation is invoked on kernel/userspace, hypervisor/guest and C-state (idle) transitions. @@ -138,17 +141,30 @@ Mitigation points When transitioning from kernel to user space the CPU buffers are flushed on affected CPUs when the mitigation is not disabled on the kernel - command line. The migitation is enabled through the static key - mds_user_clear. + command line. The mitigation is enabled through the feature flag + X86_FEATURE_CLEAR_CPU_BUF. - The mitigation is invoked in prepare_exit_to_usermode() which covers - all but one of the kernel to user space transitions. The exception - is when we return from a Non Maskable Interrupt (NMI), which is - handled directly in do_nmi(). + The mitigation is invoked just before transitioning to userspace after + user registers are restored. This is done to minimize the window in + which kernel data could be accessed after VERW e.g. via an NMI after + VERW. - (The reason that NMI is special is that prepare_exit_to_usermode() can - enable IRQs. In NMI context, NMIs are blocked, and we don't want to - enable IRQs with NMIs blocked.) + **Corner case not handled** + Interrupts returning to kernel don't clear CPUs buffers since the + exit-to-user path is expected to do that anyways. But, there could be + a case when an NMI is generated in kernel after the exit-to-user path + has cleared the buffers. This case is not handled and NMI returning to + kernel don't clear CPU buffers because: + + 1. It is rare to get an NMI after VERW, but before returning to userspace. + 2. For an unprivileged user, there is no known way to make that NMI + less rare or target it. + 3. It would take a large number of these precisely-timed NMIs to mount + an actual attack. There's presumably not enough bandwidth. + 4. The NMI in question occurs after a VERW, i.e. when user state is + restored and most interesting data is already scrubbed. Whats left + is only the data that NMI touches, and that may or may not be of + any interest. 2. C-State transition diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index 8c8d38f0cb1d..003379049924 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -6,6 +6,9 @@ #include #include #include +#include +#include +#include .pushsection .noinstr.text, "ax" @@ -20,3 +23,23 @@ SYM_FUNC_END(entry_ibpb) EXPORT_SYMBOL_GPL(entry_ibpb); .popsection + +/* + * Define the VERW operand that is disguised as entry code so that + * it can be referenced with KPTI enabled. This ensure VERW can be + * used late in exit-to-user path after page tables are switched. + */ +.pushsection .entry.text, "ax" + +.align L1_CACHE_BYTES, 0xcc +SYM_CODE_START_NOALIGN(mds_verw_sel) + UNWIND_HINT_UNDEFINED + ANNOTATE_NOENDBR + .word __KERNEL_DS +.align L1_CACHE_BYTES, 0xcc +SYM_CODE_END(mds_verw_sel); +/* For KVM */ +EXPORT_SYMBOL_GPL(mds_verw_sel); + +.popsection + diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index c73047bf9f4b..fba427646805 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -885,6 +885,7 @@ SYM_FUNC_START(entry_SYSENTER_32) BUG_IF_WRONG_CR3 no_user_check=1 popfl popl %eax + CLEAR_CPU_BUFFERS /* * Return back to the vDSO, which will pop ecx and edx. @@ -954,6 +955,7 @@ restore_all_switch_stack: /* Restore user state */ RESTORE_REGS pop=4 # skip orig_eax/error_code + CLEAR_CPU_BUFFERS .Lirq_return: /* * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization @@ -1146,6 +1148,7 @@ SYM_CODE_START(asm_exc_nmi) /* Not on SYSENTER stack. */ call exc_nmi + CLEAR_CPU_BUFFERS jmp .Lnmi_return .Lnmi_from_sysenter_stack: diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index c40f89ab1b4c..9bb485977629 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -161,6 +161,7 @@ syscall_return_via_sysret: SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL) ANNOTATE_NOENDBR swapgs + CLEAR_CPU_BUFFERS sysretq SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR @@ -573,6 +574,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL) .Lswapgs_and_iret: swapgs + CLEAR_CPU_BUFFERS /* Assert that the IRET frame indicates user mode. */ testb $3, 8(%rsp) jnz .Lnative_iret @@ -723,6 +725,8 @@ native_irq_return_ldt: */ popq %rax /* Restore user RAX */ + CLEAR_CPU_BUFFERS + /* * RSP now points to an ordinary IRET frame, except that the page * is read-only and RSP[31:16] are preloaded with the userspace @@ -1449,6 +1453,12 @@ nmi_restore: std movq $0, 5*8(%rsp) /* clear "NMI executing" */ + /* + * Skip CLEAR_CPU_BUFFERS here, since it only helps in rare cases like + * NMI in kernel after user state is restored. For an unprivileged user + * these conditions are hard to meet. + */ + /* * iretq reads the "iret" frame and exits the NMI stack in a * single instruction. We are returning to kernel mode, so this @@ -1466,6 +1476,7 @@ SYM_CODE_START(entry_SYSCALL32_ignore) UNWIND_HINT_END_OF_STACK ENDBR mov $-ENOSYS, %eax + CLEAR_CPU_BUFFERS sysretl SYM_CODE_END(entry_SYSCALL32_ignore) diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index de94e2e84ecc..eabf48c4d4b4 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -270,6 +270,7 @@ SYM_INNER_LABEL(entry_SYSRETL_compat_unsafe_stack, SYM_L_GLOBAL) xorl %r9d, %r9d xorl %r10d, %r10d swapgs + CLEAR_CPU_BUFFERS sysretl SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index fdf723b6f6d0..2b62cdd8dd12 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -95,7 +95,7 @@ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ #define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */ -/* FREE, was #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) "" LFENCE synchronizes RDTSC */ +#define X86_FEATURE_CLEAR_CPU_BUF ( 3*32+18) /* "" Clear CPU buffers using VERW */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ #define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index ce8f50192ae3..7e523bb3d2d3 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -91,7 +91,6 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, static __always_inline void arch_exit_to_user_mode(void) { - mds_user_clear_cpu_buffers(); amd_clear_divider(); } #define arch_exit_to_user_mode arch_exit_to_user_mode diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 262e65539f83..2aa52cab1e46 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -315,6 +315,17 @@ #endif .endm +/* + * Macro to execute VERW instruction that mitigate transient data sampling + * attacks such as MDS. On affected systems a microcode update overloaded VERW + * instruction to also clear the CPU buffers. VERW clobbers CFLAGS.ZF. + * + * Note: Only the memory operand variant of VERW clears the CPU buffers. + */ +.macro CLEAR_CPU_BUFFERS + ALTERNATIVE "", __stringify(verw _ASM_RIP(mds_verw_sel)), X86_FEATURE_CLEAR_CPU_BUF +.endm + #else /* __ASSEMBLY__ */ #define ANNOTATE_RETPOLINE_SAFE \ @@ -529,13 +540,14 @@ DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); -DECLARE_STATIC_KEY_FALSE(mds_user_clear); DECLARE_STATIC_KEY_FALSE(mds_idle_clear); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear); +extern u16 mds_verw_sel; + #include /** @@ -561,17 +573,6 @@ static __always_inline void mds_clear_cpu_buffers(void) asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc"); } -/** - * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability - * - * Clear CPU buffers if the corresponding static key is enabled - */ -static __always_inline void mds_user_clear_cpu_buffers(void) -{ - if (static_branch_likely(&mds_user_clear)) - mds_clear_cpu_buffers(); -} - /** * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability * diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index bb0ab8466b91..48d049cd74e7 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -111,9 +111,6 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); /* Control unconditional IBPB in switch_mm() */ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); -/* Control MDS CPU buffer clear before returning to user space */ -DEFINE_STATIC_KEY_FALSE(mds_user_clear); -EXPORT_SYMBOL_GPL(mds_user_clear); /* Control MDS CPU buffer clear before idling (halt, mwait) */ DEFINE_STATIC_KEY_FALSE(mds_idle_clear); EXPORT_SYMBOL_GPL(mds_idle_clear); @@ -252,7 +249,7 @@ static void __init mds_select_mitigation(void) if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) mds_mitigation = MDS_MITIGATION_VMWERV; - static_branch_enable(&mds_user_clear); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && (mds_nosmt || cpu_mitigations_auto_nosmt())) @@ -356,7 +353,7 @@ static void __init taa_select_mitigation(void) * For guests that can't determine whether the correct microcode is * present on host, enable the mitigation for UCODE_NEEDED as well. */ - static_branch_enable(&mds_user_clear); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); if (taa_nosmt || cpu_mitigations_auto_nosmt()) cpu_smt_disable(false); @@ -424,7 +421,7 @@ static void __init mmio_select_mitigation(void) */ if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM))) - static_branch_enable(&mds_user_clear); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); else static_branch_enable(&mmio_stale_data_clear); @@ -484,12 +481,12 @@ static void __init md_clear_update_mitigation(void) if (cpu_mitigations_off()) return; - if (!static_key_enabled(&mds_user_clear)) + if (!boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) goto out; /* - * mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data - * mitigation, if necessary. + * X86_FEATURE_CLEAR_CPU_BUF is now enabled. Update MDS, TAA and MMIO + * Stale Data mitigation, if necessary. */ if (mds_mitigation == MDS_MITIGATION_OFF && boot_cpu_has_bug(X86_BUG_MDS)) { diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 17e955ab69fe..3082cf24b69e 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -563,9 +563,6 @@ nmi_restart: } if (this_cpu_dec_return(nmi_state)) goto nmi_restart; - - if (user_mode(regs)) - mds_user_clear_cpu_buffers(); } #if IS_ENABLED(CONFIG_KVM_INTEL) diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h index edc3f16cc189..6a9bfdfbb6e5 100644 --- a/arch/x86/kvm/vmx/run_flags.h +++ b/arch/x86/kvm/vmx/run_flags.h @@ -2,7 +2,10 @@ #ifndef __KVM_X86_VMX_RUN_FLAGS_H #define __KVM_X86_VMX_RUN_FLAGS_H -#define VMX_RUN_VMRESUME (1 << 0) -#define VMX_RUN_SAVE_SPEC_CTRL (1 << 1) +#define VMX_RUN_VMRESUME_SHIFT 0 +#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1 + +#define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT) +#define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT) #endif /* __KVM_X86_VMX_RUN_FLAGS_H */ diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 906ecd001511..2bfbf758d061 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -139,7 +139,7 @@ SYM_FUNC_START(__vmx_vcpu_run) mov (%_ASM_SP), %_ASM_AX /* Check if vmlaunch or vmresume is needed */ - test $VMX_RUN_VMRESUME, %ebx + bt $VMX_RUN_VMRESUME_SHIFT, %ebx /* Load guest registers. Don't clobber flags. */ mov VCPU_RCX(%_ASM_AX), %_ASM_CX @@ -161,8 +161,11 @@ SYM_FUNC_START(__vmx_vcpu_run) /* Load guest RAX. This kills the @regs pointer! */ mov VCPU_RAX(%_ASM_AX), %_ASM_AX - /* Check EFLAGS.ZF from 'test VMX_RUN_VMRESUME' above */ - jz .Lvmlaunch + /* Clobbers EFLAGS.ZF */ + CLEAR_CPU_BUFFERS + + /* Check EFLAGS.CF from the VMX_RUN_VMRESUME bit test above. */ + jnc .Lvmlaunch /* * After a successful VMRESUME/VMLAUNCH, control flow "magically" diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1111d9d08903..88a4ff200d04 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -388,7 +388,16 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { - vmx->disable_fb_clear = (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && + /* + * Disable VERW's behavior of clearing CPU buffers for the guest if the + * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled + * the mitigation. Disabling the clearing behavior provides a + * performance boost for guests that aren't aware that manually clearing + * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry + * and VM-Exit. + */ + vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) && + (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && !boot_cpu_has_bug(X86_BUG_MDS) && !boot_cpu_has_bug(X86_BUG_TAA); @@ -7224,11 +7233,14 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, guest_state_enter_irqoff(); - /* L1D Flush includes CPU buffer clear to mitigate MDS */ + /* + * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW + * mitigation for MDS is done late in VMentry and is still + * executed in spite of L1D Flush. This is because an extra VERW + * should not matter much after the big hammer L1D Flush. + */ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); - else if (static_branch_unlikely(&mds_user_clear)) - mds_clear_cpu_buffers(); else if (static_branch_unlikely(&mmio_stale_data_clear) && kvm_arch_has_assigned_device(vcpu->kvm)) mds_clear_cpu_buffers();