2018-12-04 05:53:04 +08:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0 */
|
|
|
|
#ifndef __KVM_X86_VMX_VMCS_H
|
|
|
|
#define __KVM_X86_VMX_VMCS_H
|
|
|
|
|
|
|
|
#include <linux/ktime.h>
|
2018-12-04 05:53:05 +08:00
|
|
|
#include <linux/list.h>
|
|
|
|
#include <linux/nospec.h>
|
2018-12-04 05:53:04 +08:00
|
|
|
|
2018-12-04 05:53:05 +08:00
|
|
|
#include <asm/kvm.h>
|
2018-12-04 05:53:04 +08:00
|
|
|
#include <asm/vmx.h>
|
|
|
|
|
|
|
|
#include "capabilities.h"
|
|
|
|
|
2021-08-09 17:34:08 +08:00
|
|
|
/*
 * Rotate @val, truncated to 16 bits, left by @n bit positions and yield the
 * result as a u16.  Both arguments are expanded twice, so they must be free
 * of side effects.
 */
#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
|
|
|
|
|
2018-12-04 05:53:04 +08:00
|
|
|
struct vmcs_hdr {
|
|
|
|
u32 revision_id:31;
|
|
|
|
u32 shadow_vmcs:1;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct vmcs {
|
|
|
|
struct vmcs_hdr hdr;
|
|
|
|
u32 abort;
|
2020-05-08 02:56:18 +08:00
|
|
|
char data[];
|
2018-12-04 05:53:04 +08:00
|
|
|
};
|
|
|
|
|
2018-12-04 05:53:06 +08:00
|
|
|
/*
 * Declaration (not definition) of the per-CPU variable current_vmcs, a
 * pointer to struct vmcs.
 */
DECLARE_PER_CPU(struct vmcs *, current_vmcs);
|
|
|
|
|
2018-12-04 05:53:04 +08:00
|
|
|
/*
|
|
|
|
* vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT
|
|
|
|
* and whose values change infrequently, but are not constant. I.e. this is
|
|
|
|
* used as a write-through cache of the corresponding VMCS fields.
|
|
|
|
*/
|
|
|
|
struct vmcs_host_state {
|
|
|
|
unsigned long cr3; /* May not match real cr3 */
|
|
|
|
unsigned long cr4; /* May not match real cr4 */
|
|
|
|
unsigned long gs_base;
|
|
|
|
unsigned long fs_base;
|
2019-01-25 23:41:02 +08:00
|
|
|
unsigned long rsp;
|
2018-12-04 05:53:04 +08:00
|
|
|
|
|
|
|
u16 fs_sel, gs_sel, ldt_sel;
|
|
|
|
#ifdef CONFIG_X86_64
|
|
|
|
u16 ds_sel, es_sel;
|
|
|
|
#endif
|
|
|
|
};
|
|
|
|
|
2019-05-08 03:17:58 +08:00
|
|
|
struct vmcs_controls_shadow {
|
|
|
|
u32 vm_entry;
|
|
|
|
u32 vm_exit;
|
|
|
|
u32 pin;
|
|
|
|
u32 exec;
|
|
|
|
u32 secondary_exec;
|
|
|
|
};
|
|
|
|
|
2018-12-04 05:53:04 +08:00
|
|
|
/*
|
|
|
|
* Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
|
|
|
|
* remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
|
|
|
|
* loaded on this CPU (so we can clear them if the CPU goes down).
|
|
|
|
*/
|
|
|
|
struct loaded_vmcs {
|
|
|
|
struct vmcs *vmcs;
|
|
|
|
struct vmcs *shadow_vmcs;
|
|
|
|
int cpu;
|
|
|
|
bool launched;
|
|
|
|
bool nmi_known_unmasked;
|
KVM: VMX: Leave preemption timer running when it's disabled
VMWRITEs to the major VMCS controls, pin controls included, are
deceptively expensive. CPUs with VMCS caching (Westmere and later) also
optimize away consistency checks on VM-Entry, i.e. skip consistency
checks if the relevant fields have not changed since the last successful
VM-Entry (of the cached VMCS). Because uops are a precious commodity,
uCode's dirty VMCS field tracking isn't as precise as software would
prefer. Notably, writing any of the major VMCS fields effectively marks
the entire VMCS dirty, i.e. causes the next VM-Entry to perform all
consistency checks, which consumes several hundred cycles.
As it pertains to KVM, toggling PIN_BASED_VMX_PREEMPTION_TIMER more than
doubles the latency of the next VM-Entry (and again when/if the flag is
toggled back). In a non-nested scenario, running a "standard" guest
with the preemption timer enabled, toggling the timer flag is uncommon
but not rare, e.g. roughly 1 in 10 entries. Disabling the preemption
timer can change these numbers due to its use for "immediate exits",
even when explicitly disabled by userspace.
Nested virtualization in particular is painful, as the timer flag is set
for the majority of VM-Enters, but prepare_vmcs02() initializes vmcs02's
pin controls to *clear* the flag since its the timer's final state isn't
known until vmx_vcpu_run(). I.e. the majority of nested VM-Enters end
up unnecessarily writing pin controls *twice*.
Rather than toggle the timer flag in pin controls, set the timer value
itself to the largest allowed value to put it into a "soft disabled"
state, and ignore any spurious preemption timer exits.
Sadly, the timer is a 32-bit value and so theoretically it can fire
before the head death of the universe, i.e. spurious exits are possible.
But because KVM does *not* save the timer value on VM-Exit and because
the timer runs at a slower rate than the TSC, the maximuma timer value
is still sufficiently large for KVM's purposes. E.g. on a modern CPU
with a timer that runs at 1/32 the frequency of a 2.4ghz constant-rate
TSC, the timer will fire after ~55 seconds of *uninterrupted* guest
execution. In other words, spurious VM-Exits are effectively only
possible if the host is completely tickless on the logical CPU, the
guest is not using the preemption timer, and the guest is not generating
VM-Exits for any other reason.
To be safe from bad/weird hardware, disable the preemption timer if its
maximum delay is less than ten seconds. Ten seconds is mostly arbitrary
and was selected in no small part because it's a nice round number.
For simplicity and paranoia, fall back to __kvm_request_immediate_exit()
if the preemption timer is disabled by KVM or userspace. Previously
KVM continued to use the preemption timer to force immediate exits even
when the timer was disabled by userspace. Now that KVM leaves the timer
running instead of truly disabling it, allow userspace to kill it
entirely in the unlikely event the timer (or KVM) malfunctions.
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2019-05-08 03:18:05 +08:00
|
|
|
bool hv_timer_soft_disabled;
|
2018-12-04 05:53:04 +08:00
|
|
|
/* Support for vnmi-less CPUs */
|
|
|
|
int soft_vnmi_blocked;
|
|
|
|
ktime_t entry_time;
|
|
|
|
s64 vnmi_blocked_time;
|
|
|
|
unsigned long *msr_bitmap;
|
|
|
|
struct list_head loaded_vmcss_on_cpu_link;
|
|
|
|
struct vmcs_host_state host_state;
|
2019-05-08 03:17:58 +08:00
|
|
|
struct vmcs_controls_shadow controls_shadow;
|
2018-12-04 05:53:04 +08:00
|
|
|
};
|
|
|
|
|
2020-06-09 09:45:18 +08:00
|
|
|
/*
 * Return true when @intr_info has its valid bit set and its interruption
 * type field is exactly @type.
 *
 * @type is compared against the masked field in place, so it must already
 * be expressed within INTR_INFO_INTR_TYPE_MASK (not as a shifted-down index).
 * The vector and all other bits of @intr_info are ignored.
 */
static inline bool is_intr_type(u32 intr_info, u32 type)
{
	const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK;

	return (intr_info & mask) == (INTR_INFO_VALID_MASK | type);
}
|
|
|
|
|
|
|
|
/*
 * Return true when @intr_info has its valid bit set, its interruption type
 * field is exactly @type, and its vector field is exactly @vector.
 *
 * As with is_intr_type(), @type must already be positioned within
 * INTR_INFO_INTR_TYPE_MASK.  Bits outside the valid/type/vector masks are
 * ignored.
 */
static inline bool is_intr_type_n(u32 intr_info, u32 type, u8 vector)
{
	const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK |
			 INTR_INFO_VECTOR_MASK;

	return (intr_info & mask) == (INTR_INFO_VALID_MASK | type | vector);
}
|
|
|
|
|
2018-12-04 05:53:04 +08:00
|
|
|
/*
 * Return true when @intr_info is valid and describes an event of type
 * INTR_TYPE_HARD_EXCEPTION with vector number @vector.
 */
static inline bool is_exception_n(u32 intr_info, u8 vector)
{
	return is_intr_type_n(intr_info, INTR_TYPE_HARD_EXCEPTION, vector);
}
|
|
|
|
|
|
|
|
/* True if @intr_info is a valid hardware exception with vector DB_VECTOR. */
static inline bool is_debug(u32 intr_info)
{
	return is_exception_n(intr_info, DB_VECTOR);
}
|
|
|
|
|
|
|
|
/* True if @intr_info is a valid hardware exception with vector BP_VECTOR. */
static inline bool is_breakpoint(u32 intr_info)
{
	return is_exception_n(intr_info, BP_VECTOR);
}
|
|
|
|
|
|
|
|
/* True if @intr_info is a valid hardware exception with vector PF_VECTOR. */
static inline bool is_page_fault(u32 intr_info)
{
	return is_exception_n(intr_info, PF_VECTOR);
}
|
|
|
|
|
|
|
|
/* True if @intr_info is a valid hardware exception with vector UD_VECTOR. */
static inline bool is_invalid_opcode(u32 intr_info)
{
	return is_exception_n(intr_info, UD_VECTOR);
}
|
|
|
|
|
|
|
|
/* True if @intr_info is a valid hardware exception with vector GP_VECTOR. */
static inline bool is_gp_fault(u32 intr_info)
{
	return is_exception_n(intr_info, GP_VECTOR);
}
|
|
|
|
|
2021-06-23 01:22:44 +08:00
|
|
|
/* True if @intr_info is a valid hardware exception with vector AC_VECTOR. */
static inline bool is_alignment_check(u32 intr_info)
{
	return is_exception_n(intr_info, AC_VECTOR);
}
|
|
|
|
|
2018-12-04 05:53:04 +08:00
|
|
|
/* True if @intr_info is a valid hardware exception with vector MC_VECTOR. */
static inline bool is_machine_check(u32 intr_info)
{
	return is_exception_n(intr_info, MC_VECTOR);
}
|
|
|
|
|
|
|
|
/* Undocumented: icebp/int1 */
|
|
|
|
static inline bool is_icebp(u32 intr_info)
|
|
|
|
{
|
2020-06-09 09:45:18 +08:00
|
|
|
return is_intr_type(intr_info, INTR_TYPE_PRIV_SW_EXCEPTION);
|
2018-12-04 05:53:04 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * True if @intr_info is valid and its type is INTR_TYPE_NMI_INTR.  Only the
 * type is examined; the vector is not checked.
 */
static inline bool is_nmi(u32 intr_info)
{
	return is_intr_type(intr_info, INTR_TYPE_NMI_INTR);
}
|
|
|
|
|
2019-04-20 13:50:56 +08:00
|
|
|
/*
 * True if @intr_info is valid and its type is INTR_TYPE_EXT_INTR.  Only the
 * type is examined; the vector is not checked.
 */
static inline bool is_external_intr(u32 intr_info)
{
	return is_intr_type(intr_info, INTR_TYPE_EXT_INTR);
}
|
|
|
|
|
2020-09-24 04:13:45 +08:00
|
|
|
/*
 * True if @intr_info has both INTR_INFO_VALID_MASK and
 * INTR_INFO_DELIVER_CODE_MASK set.  The type and vector are not examined.
 */
static inline bool is_exception_with_error_code(u32 intr_info)
{
	const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK;

	return (intr_info & mask) == mask;
}
|
|
|
|
|
2018-12-04 05:53:04 +08:00
|
|
|
/*
 * Width classes of a VMCS field.  The numeric values are significant: they
 * are the raw contents of bits 14:13 of a field encoding, which is what
 * vmcs_field_width() returns for fields with bit 0 clear.
 */
enum vmcs_field_width {
	VMCS_FIELD_WIDTH_U16 = 0,
	VMCS_FIELD_WIDTH_U64 = 1,
	VMCS_FIELD_WIDTH_U32 = 2,
	VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3
};

/*
 * Decode the width class of the VMCS field encoding @field.
 *
 * Returns one of the enum vmcs_field_width values.  An encoding with bit 0
 * set always yields VMCS_FIELD_WIDTH_U32, whatever bits 14:13 hold; every
 * other encoding yields bits 14:13 verbatim.
 */
static inline int vmcs_field_width(unsigned long field)
{
	if (0x1 & field)	/* the *_HIGH fields are all 32 bit */
		return VMCS_FIELD_WIDTH_U32;

	return (field >> 13) & 0x3;
}
|
|
|
|
|
|
|
|
/*
 * Return 1 if bits 11:10 of the VMCS field encoding @field equal 1, and 0
 * otherwise.  In the Intel SDM field-encoding layout those bits are the
 * field type, and type 1 is the VM-exit information (read-only) class.
 */
static inline int vmcs_field_readonly(unsigned long field)
{
	return (((field >> 10) & 0x3) == 1);
}
|
|
|
|
|
2021-06-19 05:46:58 +08:00
|
|
|
/* The index of a VMCS field occupies bits 9:1 of its encoding. */
#define VMCS_FIELD_INDEX_SHIFT		(1)
#define VMCS_FIELD_INDEX_MASK		GENMASK(9, 1)

/*
 * Extract the index (bits 9:1, shifted down to start at bit 0) from the
 * VMCS field encoding @field.  Bit 0 and bits 10 and above are discarded.
 */
static inline unsigned int vmcs_field_index(unsigned long field)
{
	return (field & VMCS_FIELD_INDEX_MASK) >> VMCS_FIELD_INDEX_SHIFT;
}
|
|
|
|
|
2018-12-04 05:53:04 +08:00
|
|
|
#endif /* __KVM_X86_VMX_VMCS_H */
|