mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2025-01-19 04:14:49 +08:00
Generic:
- selftest compilation fix for non-x86 - KVM: avoid warning on s390 in mark_page_dirty x86: - fix page write-protection bug and improve comments - use binary search to lookup the PMU event filter, add test - enable_pmu module parameter support for Intel CPUs - switch blocked_vcpu_on_cpu_lock to raw spinlock - cleanups of blocked vCPU logic - partially allow KVM_SET_CPUID{,2} after KVM_RUN (5.16 regression) - various small fixes -----BEGIN PGP SIGNATURE----- iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmHpmT0UHHBib256aW5p QHJlZGhhdC5jb20ACgkQv/vSX3jHroOstggAi1VSpT43oGslQjXNDZacHEARoYQs b0XpoW7HXicGSGRMWspCmiAPdJyYTsioEACttAmXUMs7brAgHb9n/vzdlcLh1ymL rQw2YFQlfqqB1Ki1iRhNkWlH9xOECsu28WLng6ylrx51GuT/pzWRt+V3EGUFTxIT ldW9HgZg2oFJIaLjg2hQVR/8EbBf0QdsAD3KV3tyvhBlXPkyeLOMcGe9onfjZ/NE JQeW7FtKtP4SsIFt1KrJpDPjtiwFt3bRM0gfgGw7//clvtKIqt1LYXZiq4C3b7f5 tfYiC8lO2vnOoYcfeYEmvybbSsoS/CgSliZB32qkwoVvRMIl82YmxtDD+Q== =/Mak -----END PGP SIGNATURE----- Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm Pull more kvm updates from Paolo Bonzini: "Generic: - selftest compilation fix for non-x86 - KVM: avoid warning on s390 in mark_page_dirty x86: - fix page write-protection bug and improve comments - use binary search to lookup the PMU event filter, add test - enable_pmu module parameter support for Intel CPUs - switch blocked_vcpu_on_cpu_lock to raw spinlock - cleanups of blocked vCPU logic - partially allow KVM_SET_CPUID{,2} after KVM_RUN (5.16 regression) - various small fixes" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (46 commits) docs: kvm: fix WARNINGs from api.rst selftests: kvm/x86: Fix the warning in lib/x86_64/processor.c selftests: kvm/x86: Fix the warning in pmu_event_filter_test.c kvm: selftests: Do not indent with spaces kvm: selftests: sync uapi/linux/kvm.h with Linux header selftests: kvm: add amx_test to .gitignore KVM: SVM: Nullify vcpu_(un)blocking() hooks if AVIC is disabled KVM: SVM: Move svm_hardware_setup() and its helpers 
below svm_x86_ops KVM: SVM: Drop AVIC's intermediate avic_set_running() helper KVM: VMX: Don't do full kick when handling posted interrupt wakeup KVM: VMX: Fold fallback path into triggering posted IRQ helper KVM: VMX: Pass desired vector instead of bool for triggering posted IRQ KVM: VMX: Don't do full kick when triggering posted interrupt "fails" KVM: SVM: Skip AVIC and IRTE updates when loading blocking vCPU KVM: SVM: Use kvm_vcpu_is_blocking() in AVIC load to handle preemption KVM: SVM: Remove unnecessary APICv/AVIC update in vCPU unblocking path KVM: SVM: Don't bother checking for "running" AVIC when kicking for IPIs KVM: SVM: Signal AVIC doorbell iff vCPU is in guest mode KVM: x86: Remove defunct pre_block/post_block kvm_x86_ops hooks KVM: x86: Unexport LAPIC's switch_to_{hv,sw}_timer() helpers ...
This commit is contained in:
commit
636b5284d8
@ -5545,8 +5545,8 @@ the trailing ``'\0'``, is indicated by ``name_size`` in the header.
|
||||
The Stats Data block contains an array of 64-bit values in the same order
|
||||
as the descriptors in Descriptors block.
|
||||
|
||||
4.42 KVM_GET_XSAVE2
|
||||
------------------
|
||||
4.134 KVM_GET_XSAVE2
|
||||
--------------------
|
||||
|
||||
:Capability: KVM_CAP_XSAVE2
|
||||
:Architectures: x86
|
||||
@ -7363,7 +7363,7 @@ trap and emulate MSRs that are outside of the scope of KVM as well as
|
||||
limit the attack surface on KVM's MSR emulation code.
|
||||
|
||||
8.28 KVM_CAP_ENFORCE_PV_FEATURE_CPUID
|
||||
-----------------------------
|
||||
-------------------------------------
|
||||
|
||||
Architectures: x86
|
||||
|
||||
|
@ -55,6 +55,7 @@ KVM_X86_OP_NULL(tlb_remote_flush)
|
||||
KVM_X86_OP_NULL(tlb_remote_flush_with_range)
|
||||
KVM_X86_OP(tlb_flush_gva)
|
||||
KVM_X86_OP(tlb_flush_guest)
|
||||
KVM_X86_OP(vcpu_pre_run)
|
||||
KVM_X86_OP(run)
|
||||
KVM_X86_OP_NULL(handle_exit)
|
||||
KVM_X86_OP_NULL(skip_emulated_instruction)
|
||||
@ -98,8 +99,6 @@ KVM_X86_OP(handle_exit_irqoff)
|
||||
KVM_X86_OP_NULL(request_immediate_exit)
|
||||
KVM_X86_OP(sched_in)
|
||||
KVM_X86_OP_NULL(update_cpu_dirty_logging)
|
||||
KVM_X86_OP_NULL(pre_block)
|
||||
KVM_X86_OP_NULL(post_block)
|
||||
KVM_X86_OP_NULL(vcpu_blocking)
|
||||
KVM_X86_OP_NULL(vcpu_unblocking)
|
||||
KVM_X86_OP_NULL(update_pi_irte)
|
||||
|
@ -1381,6 +1381,7 @@ struct kvm_x86_ops {
|
||||
*/
|
||||
void (*tlb_flush_guest)(struct kvm_vcpu *vcpu);
|
||||
|
||||
int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
|
||||
enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu);
|
||||
int (*handle_exit)(struct kvm_vcpu *vcpu,
|
||||
enum exit_fastpath_completion exit_fastpath);
|
||||
@ -1454,18 +1455,6 @@ struct kvm_x86_ops {
|
||||
const struct kvm_pmu_ops *pmu_ops;
|
||||
const struct kvm_x86_nested_ops *nested_ops;
|
||||
|
||||
/*
|
||||
* Architecture specific hooks for vCPU blocking due to
|
||||
* HLT instruction.
|
||||
* Returns for .pre_block():
|
||||
* - 0 means continue to block the vCPU.
|
||||
* - 1 means we cannot block the vCPU since some event
|
||||
* happens during this period, such as, 'ON' bit in
|
||||
* posted-interrupts descriptor is set.
|
||||
*/
|
||||
int (*pre_block)(struct kvm_vcpu *vcpu);
|
||||
void (*post_block)(struct kvm_vcpu *vcpu);
|
||||
|
||||
void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
|
||||
void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);
|
||||
|
||||
|
@ -119,6 +119,28 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu,
|
||||
return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures);
|
||||
}
|
||||
|
||||
/* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */
|
||||
static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
|
||||
int nent)
|
||||
{
|
||||
struct kvm_cpuid_entry2 *orig;
|
||||
int i;
|
||||
|
||||
if (nent != vcpu->arch.cpuid_nent)
|
||||
return -EINVAL;
|
||||
|
||||
for (i = 0; i < nent; i++) {
|
||||
orig = &vcpu->arch.cpuid_entries[i];
|
||||
if (e2[i].function != orig->function ||
|
||||
e2[i].index != orig->index ||
|
||||
e2[i].eax != orig->eax || e2[i].ebx != orig->ebx ||
|
||||
e2[i].ecx != orig->ecx || e2[i].edx != orig->edx)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
u32 function;
|
||||
@ -145,14 +167,21 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
|
||||
}
|
||||
}
|
||||
|
||||
static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
|
||||
static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu,
|
||||
struct kvm_cpuid_entry2 *entries, int nent)
|
||||
{
|
||||
u32 base = vcpu->arch.kvm_cpuid_base;
|
||||
|
||||
if (!base)
|
||||
return NULL;
|
||||
|
||||
return kvm_find_cpuid_entry(vcpu, base | KVM_CPUID_FEATURES, 0);
|
||||
return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0);
|
||||
}
|
||||
|
||||
static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries,
|
||||
vcpu->arch.cpuid_nent);
|
||||
}
|
||||
|
||||
void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
|
||||
@ -167,11 +196,12 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
|
||||
vcpu->arch.pv_cpuid.features = best->eax;
|
||||
}
|
||||
|
||||
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
|
||||
static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries,
|
||||
int nent)
|
||||
{
|
||||
struct kvm_cpuid_entry2 *best;
|
||||
|
||||
best = kvm_find_cpuid_entry(vcpu, 1, 0);
|
||||
best = cpuid_entry2_find(entries, nent, 1, 0);
|
||||
if (best) {
|
||||
/* Update OSXSAVE bit */
|
||||
if (boot_cpu_has(X86_FEATURE_XSAVE))
|
||||
@ -182,33 +212,38 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
|
||||
vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
|
||||
}
|
||||
|
||||
best = kvm_find_cpuid_entry(vcpu, 7, 0);
|
||||
best = cpuid_entry2_find(entries, nent, 7, 0);
|
||||
if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
|
||||
cpuid_entry_change(best, X86_FEATURE_OSPKE,
|
||||
kvm_read_cr4_bits(vcpu, X86_CR4_PKE));
|
||||
|
||||
best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
|
||||
best = cpuid_entry2_find(entries, nent, 0xD, 0);
|
||||
if (best)
|
||||
best->ebx = xstate_required_size(vcpu->arch.xcr0, false);
|
||||
|
||||
best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
|
||||
best = cpuid_entry2_find(entries, nent, 0xD, 1);
|
||||
if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
|
||||
cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
|
||||
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
|
||||
|
||||
best = kvm_find_kvm_cpuid_features(vcpu);
|
||||
best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent);
|
||||
if (kvm_hlt_in_guest(vcpu->kvm) && best &&
|
||||
(best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
|
||||
best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
|
||||
|
||||
if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
|
||||
best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
|
||||
best = cpuid_entry2_find(entries, nent, 0x1, 0);
|
||||
if (best)
|
||||
cpuid_entry_change(best, X86_FEATURE_MWAIT,
|
||||
vcpu->arch.ia32_misc_enable_msr &
|
||||
MSR_IA32_MISC_ENABLE_MWAIT);
|
||||
}
|
||||
}
|
||||
|
||||
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
__kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
|
||||
|
||||
static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
|
||||
@ -298,6 +333,22 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
|
||||
{
|
||||
int r;
|
||||
|
||||
__kvm_update_cpuid_runtime(vcpu, e2, nent);
|
||||
|
||||
/*
|
||||
* KVM does not correctly handle changing guest CPUID after KVM_RUN, as
|
||||
* MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
|
||||
* tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
|
||||
* faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
|
||||
* the core vCPU model on the fly. It would've been better to forbid any
|
||||
* KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately
|
||||
* some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do
|
||||
* KVM_SET_CPUID{,2} again. To support this legacy behavior, check
|
||||
* whether the supplied CPUID data is equal to what's already set.
|
||||
*/
|
||||
if (vcpu->arch.last_vmentry_cpu != -1)
|
||||
return kvm_cpuid_check_equal(vcpu, e2, nent);
|
||||
|
||||
r = kvm_check_cpuid(vcpu, e2, nent);
|
||||
if (r)
|
||||
return r;
|
||||
@ -307,7 +358,6 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
|
||||
vcpu->arch.cpuid_nent = nent;
|
||||
|
||||
kvm_update_kvm_cpuid_base(vcpu);
|
||||
kvm_update_cpuid_runtime(vcpu);
|
||||
kvm_vcpu_after_set_cpuid(vcpu);
|
||||
|
||||
return 0;
|
||||
@ -795,10 +845,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
|
||||
perf_get_x86_pmu_capability(&cap);
|
||||
|
||||
/*
|
||||
* Only support guest architectural pmu on a host
|
||||
* with architectural pmu.
|
||||
* The guest architecture pmu is only supported if the architecture
|
||||
* pmu exists on the host and the module parameters allow it.
|
||||
*/
|
||||
if (!cap.version)
|
||||
if (!cap.version || !enable_pmu)
|
||||
memset(&cap, 0, sizeof(cap));
|
||||
|
||||
eax.split.version_id = min(cap.version, 2);
|
||||
@ -886,6 +936,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
|
||||
--array->nent;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
|
||||
entry->ecx &= ~BIT_ULL(2);
|
||||
entry->edx = 0;
|
||||
}
|
||||
break;
|
||||
|
@ -1950,7 +1950,6 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
restart_apic_timer(vcpu->arch.apic);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
|
||||
|
||||
void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
@ -1962,7 +1961,6 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
|
||||
start_sw_timer(apic);
|
||||
preempt_enable();
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
|
||||
|
||||
void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
|
@ -5756,6 +5756,7 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
|
||||
continue;
|
||||
|
||||
flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
|
||||
|
||||
PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
|
||||
start, end - 1, true, flush);
|
||||
}
|
||||
@ -5825,15 +5826,27 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
|
||||
}
|
||||
|
||||
/*
|
||||
* We can flush all the TLBs out of the mmu lock without TLB
|
||||
* corruption since we just change the spte from writable to
|
||||
* readonly so that we only need to care the case of changing
|
||||
* spte from present to present (changing the spte from present
|
||||
* to nonpresent will flush all the TLBs immediately), in other
|
||||
* words, the only case we care is mmu_spte_update() where we
|
||||
* have checked Host-writable | MMU-writable instead of
|
||||
* PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK
|
||||
* anymore.
|
||||
* Flush TLBs if any SPTEs had to be write-protected to ensure that
|
||||
* guest writes are reflected in the dirty bitmap before the memslot
|
||||
* update completes, i.e. before enabling dirty logging is visible to
|
||||
* userspace.
|
||||
*
|
||||
* Perform the TLB flush outside the mmu_lock to reduce the amount of
|
||||
* time the lock is held. However, this does mean that another CPU can
|
||||
* now grab mmu_lock and encounter a write-protected SPTE while CPUs
|
||||
* still have a writable mapping for the associated GFN in their TLB.
|
||||
*
|
||||
* This is safe but requires KVM to be careful when making decisions
|
||||
* based on the write-protection status of an SPTE. Specifically, KVM
|
||||
* also write-protects SPTEs to monitor changes to guest page tables
|
||||
* during shadow paging, and must guarantee no CPUs can write to those
|
||||
* pages before the lock is dropped. As mentioned in the previous
|
||||
* paragraph, a write-protected SPTE is no guarantee that a CPU cannot
|
||||
* perform writes. So to determine if a TLB flush is truly required, KVM
|
||||
* will clear a separate software-only bit (MMU-writable) and skip the
|
||||
* flush if-and-only-if this bit was already clear.
|
||||
*
|
||||
* See DEFAULT_SPTE_MMU_WRITEABLE for more details.
|
||||
*/
|
||||
if (flush)
|
||||
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
|
||||
|
@ -216,6 +216,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
|
||||
|
||||
new_spte &= ~PT_WRITABLE_MASK;
|
||||
new_spte &= ~shadow_host_writable_mask;
|
||||
new_spte &= ~shadow_mmu_writable_mask;
|
||||
|
||||
new_spte = mark_spte_for_access_track(new_spte);
|
||||
|
||||
|
@ -60,10 +60,6 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
|
||||
(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
|
||||
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
|
||||
|
||||
/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
|
||||
#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9)
|
||||
#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10)
|
||||
|
||||
/*
|
||||
* The mask/shift to use for saving the original R/X bits when marking the PTE
|
||||
* as not-present for access tracking purposes. We do not save the W bit as the
|
||||
@ -78,6 +74,35 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
|
||||
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
|
||||
static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
|
||||
|
||||
/*
|
||||
* *_SPTE_HOST_WRITEABLE (aka Host-writable) indicates whether the host permits
|
||||
* writes to the guest page mapped by the SPTE. This bit is cleared on SPTEs
|
||||
* that map guest pages in read-only memslots and read-only VMAs.
|
||||
*
|
||||
* Invariants:
|
||||
* - If Host-writable is clear, PT_WRITABLE_MASK must be clear.
|
||||
*
|
||||
*
|
||||
* *_SPTE_MMU_WRITEABLE (aka MMU-writable) indicates whether the shadow MMU
|
||||
* allows writes to the guest page mapped by the SPTE. This bit is cleared when
|
||||
* the guest page mapped by the SPTE contains a page table that is being
|
||||
* monitored for shadow paging. In this case the SPTE can only be made writable
|
||||
* by unsyncing the shadow page under the mmu_lock.
|
||||
*
|
||||
* Invariants:
|
||||
* - If MMU-writable is clear, PT_WRITABLE_MASK must be clear.
|
||||
* - If MMU-writable is set, Host-writable must be set.
|
||||
*
|
||||
* If MMU-writable is set, PT_WRITABLE_MASK is normally set but can be cleared
|
||||
* to track writes for dirty logging. For such SPTEs, KVM will locklessly set
|
||||
* PT_WRITABLE_MASK upon the next write from the guest and record the write in
|
||||
* the dirty log (see fast_page_fault()).
|
||||
*/
|
||||
|
||||
/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
|
||||
#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9)
|
||||
#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10)
|
||||
|
||||
/*
|
||||
* Low ignored bits are at a premium for EPT, use high ignored bits, taking care
|
||||
* to not overlap the A/D type mask or the saved access bits of access-tracked
|
||||
@ -316,8 +341,13 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
|
||||
|
||||
static inline bool spte_can_locklessly_be_made_writable(u64 spte)
|
||||
{
|
||||
return (spte & shadow_host_writable_mask) &&
|
||||
(spte & shadow_mmu_writable_mask);
|
||||
if (spte & shadow_mmu_writable_mask) {
|
||||
WARN_ON_ONCE(!(spte & shadow_host_writable_mask));
|
||||
return true;
|
||||
}
|
||||
|
||||
WARN_ON_ONCE(spte & PT_WRITABLE_MASK);
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline u64 get_mmio_spte_generation(u64 spte)
|
||||
|
@ -1442,12 +1442,12 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
|
||||
!is_last_spte(iter.old_spte, iter.level))
|
||||
continue;
|
||||
|
||||
if (!is_writable_pte(iter.old_spte))
|
||||
break;
|
||||
|
||||
new_spte = iter.old_spte &
|
||||
~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
|
||||
|
||||
if (new_spte == iter.old_spte)
|
||||
break;
|
||||
|
||||
tdp_mmu_set_spte(kvm, &iter, new_spte);
|
||||
spte_set = true;
|
||||
}
|
||||
|
@ -13,6 +13,8 @@
|
||||
#include <linux/types.h>
|
||||
#include <linux/kvm_host.h>
|
||||
#include <linux/perf_event.h>
|
||||
#include <linux/bsearch.h>
|
||||
#include <linux/sort.h>
|
||||
#include <asm/perf_event.h>
|
||||
#include "x86.h"
|
||||
#include "cpuid.h"
|
||||
@ -109,6 +111,9 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
|
||||
.config = config,
|
||||
};
|
||||
|
||||
if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX)
|
||||
return;
|
||||
|
||||
attr.sample_period = get_sample_period(pmc, pmc->counter);
|
||||
|
||||
if (in_tx)
|
||||
@ -169,12 +174,16 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
 * Three-way comparator for the sort() and bsearch() calls that operate on
 * the PMU event filter's array of 64-bit event values.
 *
 * @a, @b: pointers to the two 64-bit values to compare.
 *
 * Returns a negative, zero or positive value when *a is less than, equal to
 * or greater than *b.  The result is derived from the two comparisons rather
 * than from the difference: a 64-bit difference narrowed to int keeps only
 * the low 32 bits, so values that differ only above bit 31 would compare as
 * equal and differences with bit 31 set would come out with the wrong sign.
 */
static int cmp_u64(const void *a, const void *b)
{
	__u64 lhs = *(const __u64 *)a;
	__u64 rhs = *(const __u64 *)b;

	return (lhs > rhs) - (lhs < rhs);
}
|
||||
|
||||
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
|
||||
{
|
||||
unsigned config, type = PERF_TYPE_RAW;
|
||||
struct kvm *kvm = pmc->vcpu->kvm;
|
||||
struct kvm_pmu_event_filter *filter;
|
||||
int i;
|
||||
bool allow_event = true;
|
||||
|
||||
if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
|
||||
@ -189,16 +198,13 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
|
||||
|
||||
filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
|
||||
if (filter) {
|
||||
for (i = 0; i < filter->nevents; i++)
|
||||
if (filter->events[i] ==
|
||||
(eventsel & AMD64_RAW_EVENT_MASK_NB))
|
||||
break;
|
||||
if (filter->action == KVM_PMU_EVENT_ALLOW &&
|
||||
i == filter->nevents)
|
||||
allow_event = false;
|
||||
if (filter->action == KVM_PMU_EVENT_DENY &&
|
||||
i < filter->nevents)
|
||||
allow_event = false;
|
||||
__u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;
|
||||
|
||||
if (bsearch(&key, filter->events, filter->nevents,
|
||||
sizeof(__u64), cmp_u64))
|
||||
allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
|
||||
else
|
||||
allow_event = filter->action == KVM_PMU_EVENT_DENY;
|
||||
}
|
||||
if (!allow_event)
|
||||
return;
|
||||
@ -573,6 +579,11 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
|
||||
/* Ensure nevents can't be changed between the user copies. */
|
||||
*filter = tmp;
|
||||
|
||||
/*
|
||||
* Sort the in-kernel list so that we can search it with bsearch.
|
||||
*/
|
||||
sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL);
|
||||
|
||||
mutex_lock(&kvm->lock);
|
||||
filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
|
||||
mutex_is_locked(&kvm->lock));
|
||||
|
@ -295,13 +295,16 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
|
||||
struct kvm_vcpu *vcpu;
|
||||
unsigned long i;
|
||||
|
||||
/*
|
||||
* Wake any target vCPUs that are blocking, i.e. waiting for a wake
|
||||
* event. There's no need to signal doorbells, as hardware has handled
|
||||
* vCPUs that were in guest at the time of the IPI, and vCPUs that have
|
||||
* since entered the guest will have processed pending IRQs at VMRUN.
|
||||
*/
|
||||
kvm_for_each_vcpu(i, vcpu, kvm) {
|
||||
bool m = kvm_apic_match_dest(vcpu, source,
|
||||
icrl & APIC_SHORT_MASK,
|
||||
GET_APIC_DEST_FIELD(icrh),
|
||||
icrl & APIC_DEST_MASK);
|
||||
|
||||
if (m && !avic_vcpu_is_running(vcpu))
|
||||
if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
|
||||
GET_APIC_DEST_FIELD(icrh),
|
||||
icrl & APIC_DEST_MASK))
|
||||
kvm_vcpu_wake_up(vcpu);
|
||||
}
|
||||
}
|
||||
@ -672,9 +675,22 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
|
||||
return -1;
|
||||
|
||||
kvm_lapic_set_irr(vec, vcpu->arch.apic);
|
||||
|
||||
/*
|
||||
* Pairs with the smp_mb_*() after setting vcpu->guest_mode in
|
||||
* vcpu_enter_guest() to ensure the write to the vIRR is ordered before
|
||||
* the read of guest_mode, which guarantees that either VMRUN will see
|
||||
* and process the new vIRR entry, or that the below code will signal
|
||||
* the doorbell if the vCPU is already running in the guest.
|
||||
*/
|
||||
smp_mb__after_atomic();
|
||||
|
||||
if (avic_vcpu_is_running(vcpu)) {
|
||||
/*
|
||||
* Signal the doorbell to tell hardware to inject the IRQ if the vCPU
|
||||
* is in the guest. If the vCPU is not in the guest, hardware will
|
||||
* automatically process AVIC interrupts at VMRUN.
|
||||
*/
|
||||
if (vcpu->mode == IN_GUEST_MODE) {
|
||||
int cpu = READ_ONCE(vcpu->cpu);
|
||||
|
||||
/*
|
||||
@ -688,8 +704,13 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
|
||||
if (cpu != get_cpu())
|
||||
wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
|
||||
put_cpu();
|
||||
} else
|
||||
} else {
|
||||
/*
|
||||
* Wake the vCPU if it was blocking. KVM will then detect the
|
||||
* pending IRQ when checking if the vCPU has a wake event.
|
||||
*/
|
||||
kvm_vcpu_wake_up(vcpu);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -957,6 +978,8 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
|
||||
int h_physical_id = kvm_cpu_get_apicid(cpu);
|
||||
struct vcpu_svm *svm = to_svm(vcpu);
|
||||
|
||||
lockdep_assert_preemption_disabled();
|
||||
|
||||
/*
|
||||
* Since the host physical APIC id is 8 bits,
|
||||
* we can support host APIC ID up to 255.
|
||||
@ -964,19 +987,25 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
|
||||
if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
|
||||
return;
|
||||
|
||||
/*
|
||||
* No need to update anything if the vCPU is blocking, i.e. if the vCPU
|
||||
* is being scheduled in after being preempted. The CPU entries in the
|
||||
* Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
|
||||
* If the vCPU was migrated, its new CPU value will be stuffed when the
|
||||
* vCPU unblocks.
|
||||
*/
|
||||
if (kvm_vcpu_is_blocking(vcpu))
|
||||
return;
|
||||
|
||||
entry = READ_ONCE(*(svm->avic_physical_id_cache));
|
||||
WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
|
||||
|
||||
entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
|
||||
entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
|
||||
|
||||
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
|
||||
if (svm->avic_is_running)
|
||||
entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
|
||||
entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
|
||||
|
||||
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
|
||||
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
|
||||
svm->avic_is_running);
|
||||
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
|
||||
}
|
||||
|
||||
void avic_vcpu_put(struct kvm_vcpu *vcpu)
|
||||
@ -984,42 +1013,56 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
|
||||
u64 entry;
|
||||
struct vcpu_svm *svm = to_svm(vcpu);
|
||||
|
||||
lockdep_assert_preemption_disabled();
|
||||
|
||||
entry = READ_ONCE(*(svm->avic_physical_id_cache));
|
||||
if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
|
||||
avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
|
||||
|
||||
/* Nothing to do if IsRunning == '0' due to vCPU blocking. */
|
||||
if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
|
||||
return;
|
||||
|
||||
avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
|
||||
|
||||
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
|
||||
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is called during VCPU halt/unhalt.
|
||||
*/
|
||||
static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
|
||||
void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct vcpu_svm *svm = to_svm(vcpu);
|
||||
int cpu = get_cpu();
|
||||
if (!kvm_vcpu_apicv_active(vcpu))
|
||||
return;
|
||||
|
||||
preempt_disable();
|
||||
|
||||
/*
|
||||
* Unload the AVIC when the vCPU is about to block, _before_
|
||||
* the vCPU actually blocks.
|
||||
*
|
||||
* Any IRQs that arrive before IsRunning=0 will not cause an
|
||||
* incomplete IPI vmexit on the source, therefore vIRR will also
|
||||
* be checked by kvm_vcpu_check_block() before blocking. The
|
||||
* memory barrier implicit in set_current_state orders writing
|
||||
* IsRunning=0 before reading the vIRR. The processor needs a
|
||||
* matching memory barrier on interrupt delivery between writing
|
||||
* IRR and reading IsRunning; the lack of this barrier might be
|
||||
* the cause of errata #1235).
|
||||
*/
|
||||
avic_vcpu_put(vcpu);
|
||||
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
if (!kvm_vcpu_apicv_active(vcpu))
|
||||
return;
|
||||
|
||||
cpu = get_cpu();
|
||||
WARN_ON(cpu != vcpu->cpu);
|
||||
svm->avic_is_running = is_run;
|
||||
|
||||
if (kvm_vcpu_apicv_active(vcpu)) {
|
||||
if (is_run)
|
||||
avic_vcpu_load(vcpu, cpu);
|
||||
else
|
||||
avic_vcpu_put(vcpu);
|
||||
}
|
||||
avic_vcpu_load(vcpu, cpu);
|
||||
|
||||
put_cpu();
|
||||
}
|
||||
|
||||
void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
avic_set_running(vcpu, false);
|
||||
}
|
||||
|
||||
void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
|
||||
kvm_vcpu_update_apicv(vcpu);
|
||||
avic_set_running(vcpu, true);
|
||||
}
|
||||
|
@ -101,7 +101,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
|
||||
{
|
||||
struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
|
||||
|
||||
if (!pmu)
|
||||
if (!enable_pmu)
|
||||
return NULL;
|
||||
|
||||
switch (msr) {
|
||||
|
@ -192,10 +192,6 @@ module_param(vgif, int, 0444);
|
||||
static int lbrv = true;
|
||||
module_param(lbrv, int, 0444);
|
||||
|
||||
/* enable/disable PMU virtualization */
|
||||
bool pmu = true;
|
||||
module_param(pmu, bool, 0444);
|
||||
|
||||
static int tsc_scaling = true;
|
||||
module_param(tsc_scaling, int, 0444);
|
||||
|
||||
@ -873,47 +869,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* The default MMIO mask is a single bit (excluding the present bit),
|
||||
* which could conflict with the memory encryption bit. Check for
|
||||
* memory encryption support and override the default MMIO mask if
|
||||
* memory encryption is enabled.
|
||||
*/
|
||||
static __init void svm_adjust_mmio_mask(void)
|
||||
{
|
||||
unsigned int enc_bit, mask_bit;
|
||||
u64 msr, mask;
|
||||
|
||||
/* If there is no memory encryption support, use existing mask */
|
||||
if (cpuid_eax(0x80000000) < 0x8000001f)
|
||||
return;
|
||||
|
||||
/* If memory encryption is not enabled, use existing mask */
|
||||
rdmsrl(MSR_AMD64_SYSCFG, msr);
|
||||
if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
|
||||
return;
|
||||
|
||||
enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
|
||||
mask_bit = boot_cpu_data.x86_phys_bits;
|
||||
|
||||
/* Increment the mask bit if it is the same as the encryption bit */
|
||||
if (enc_bit == mask_bit)
|
||||
mask_bit++;
|
||||
|
||||
/*
|
||||
* If the mask bit location is below 52, then some bits above the
|
||||
* physical addressing limit will always be reserved, so use the
|
||||
* rsvd_bits() function to generate the mask. This mask, along with
|
||||
* the present bit, will be used to generate a page fault with
|
||||
* PFER.RSV = 1.
|
||||
*
|
||||
* If the mask bit location is 52 (or above), then clear the mask.
|
||||
*/
|
||||
mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
|
||||
|
||||
kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
|
||||
}
|
||||
|
||||
static void svm_hardware_teardown(void)
|
||||
{
|
||||
int cpu;
|
||||
@ -928,198 +883,6 @@ static void svm_hardware_teardown(void)
|
||||
iopm_base = 0;
|
||||
}
|
||||
|
||||
static __init void svm_set_cpu_caps(void)
|
||||
{
|
||||
kvm_set_cpu_caps();
|
||||
|
||||
supported_xss = 0;
|
||||
|
||||
/* CPUID 0x80000001 and 0x8000000A (SVM features) */
|
||||
if (nested) {
|
||||
kvm_cpu_cap_set(X86_FEATURE_SVM);
|
||||
|
||||
if (nrips)
|
||||
kvm_cpu_cap_set(X86_FEATURE_NRIPS);
|
||||
|
||||
if (npt_enabled)
|
||||
kvm_cpu_cap_set(X86_FEATURE_NPT);
|
||||
|
||||
if (tsc_scaling)
|
||||
kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
|
||||
|
||||
/* Nested VM can receive #VMEXIT instead of triggering #GP */
|
||||
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
|
||||
}
|
||||
|
||||
/* CPUID 0x80000008 */
|
||||
if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
|
||||
boot_cpu_has(X86_FEATURE_AMD_SSBD))
|
||||
kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
|
||||
|
||||
/* AMD PMU PERFCTR_CORE CPUID */
|
||||
if (pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
|
||||
kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
|
||||
|
||||
/* CPUID 0x8000001F (SME/SEV features) */
|
||||
sev_set_cpu_caps();
|
||||
}
|
||||
|
||||
static __init int svm_hardware_setup(void)
|
||||
{
|
||||
int cpu;
|
||||
struct page *iopm_pages;
|
||||
void *iopm_va;
|
||||
int r;
|
||||
unsigned int order = get_order(IOPM_SIZE);
|
||||
|
||||
/*
|
||||
* NX is required for shadow paging and for NPT if the NX huge pages
|
||||
* mitigation is enabled.
|
||||
*/
|
||||
if (!boot_cpu_has(X86_FEATURE_NX)) {
|
||||
pr_err_ratelimited("NX (Execute Disable) not supported\n");
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
kvm_enable_efer_bits(EFER_NX);
|
||||
|
||||
iopm_pages = alloc_pages(GFP_KERNEL, order);
|
||||
|
||||
if (!iopm_pages)
|
||||
return -ENOMEM;
|
||||
|
||||
iopm_va = page_address(iopm_pages);
|
||||
memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
|
||||
iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
|
||||
|
||||
init_msrpm_offsets();
|
||||
|
||||
supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
|
||||
|
||||
if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
|
||||
kvm_enable_efer_bits(EFER_FFXSR);
|
||||
|
||||
if (tsc_scaling) {
|
||||
if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
|
||||
tsc_scaling = false;
|
||||
} else {
|
||||
pr_info("TSC scaling supported\n");
|
||||
kvm_has_tsc_control = true;
|
||||
kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
|
||||
kvm_tsc_scaling_ratio_frac_bits = 32;
|
||||
}
|
||||
}
|
||||
|
||||
tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
|
||||
|
||||
/* Check for pause filtering support */
|
||||
if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
|
||||
pause_filter_count = 0;
|
||||
pause_filter_thresh = 0;
|
||||
} else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
|
||||
pause_filter_thresh = 0;
|
||||
}
|
||||
|
||||
if (nested) {
|
||||
printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
|
||||
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
|
||||
}
|
||||
|
||||
/*
|
||||
* KVM's MMU doesn't support using 2-level paging for itself, and thus
|
||||
* NPT isn't supported if the host is using 2-level paging since host
|
||||
* CR4 is unchanged on VMRUN.
|
||||
*/
|
||||
if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
|
||||
npt_enabled = false;
|
||||
|
||||
if (!boot_cpu_has(X86_FEATURE_NPT))
|
||||
npt_enabled = false;
|
||||
|
||||
/* Force VM NPT level equal to the host's paging level */
|
||||
kvm_configure_mmu(npt_enabled, get_npt_level(),
|
||||
get_npt_level(), PG_LEVEL_1G);
|
||||
pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
|
||||
|
||||
/* Note, SEV setup consumes npt_enabled. */
|
||||
sev_hardware_setup();
|
||||
|
||||
svm_hv_hardware_setup();
|
||||
|
||||
svm_adjust_mmio_mask();
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
r = svm_cpu_init(cpu);
|
||||
if (r)
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (nrips) {
|
||||
if (!boot_cpu_has(X86_FEATURE_NRIPS))
|
||||
nrips = false;
|
||||
}
|
||||
|
||||
enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
|
||||
|
||||
if (enable_apicv) {
|
||||
pr_info("AVIC enabled\n");
|
||||
|
||||
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
|
||||
}
|
||||
|
||||
if (vls) {
|
||||
if (!npt_enabled ||
|
||||
!boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
|
||||
!IS_ENABLED(CONFIG_X86_64)) {
|
||||
vls = false;
|
||||
} else {
|
||||
pr_info("Virtual VMLOAD VMSAVE supported\n");
|
||||
}
|
||||
}
|
||||
|
||||
if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
|
||||
svm_gp_erratum_intercept = false;
|
||||
|
||||
if (vgif) {
|
||||
if (!boot_cpu_has(X86_FEATURE_VGIF))
|
||||
vgif = false;
|
||||
else
|
||||
pr_info("Virtual GIF supported\n");
|
||||
}
|
||||
|
||||
if (lbrv) {
|
||||
if (!boot_cpu_has(X86_FEATURE_LBRV))
|
||||
lbrv = false;
|
||||
else
|
||||
pr_info("LBR virtualization supported\n");
|
||||
}
|
||||
|
||||
if (!pmu)
|
||||
pr_info("PMU virtualization is disabled\n");
|
||||
|
||||
svm_set_cpu_caps();
|
||||
|
||||
/*
|
||||
* It seems that on AMD processors PTE's accessed bit is
|
||||
* being set by the CPU hardware before the NPF vmexit.
|
||||
* This is not expected behaviour and our tests fail because
|
||||
* of it.
|
||||
* A workaround here is to disable support for
|
||||
* GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
|
||||
* In this case userspace can know if there is support using
|
||||
* KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
|
||||
* it
|
||||
* If future AMD CPU models change the behaviour described above,
|
||||
* this variable can be changed accordingly
|
||||
*/
|
||||
allow_smaller_maxphyaddr = !npt_enabled;
|
||||
|
||||
return 0;
|
||||
|
||||
err:
|
||||
svm_hardware_teardown();
|
||||
return r;
|
||||
}
|
||||
|
||||
static void init_seg(struct vmcb_seg *seg)
|
||||
{
|
||||
seg->selector = 0;
|
||||
@ -1444,12 +1207,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
|
||||
if (err)
|
||||
goto error_free_vmsa_page;
|
||||
|
||||
/* We initialize this flag to true to make sure that the is_running
|
||||
* bit would be set the first time the vcpu is loaded.
|
||||
*/
|
||||
if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
|
||||
svm->avic_is_running = true;
|
||||
|
||||
svm->msrpm = svm_vcpu_alloc_msrpm();
|
||||
if (!svm->msrpm) {
|
||||
err = -ENOMEM;
|
||||
@ -3833,6 +3590,11 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
|
||||
svm_complete_interrupts(vcpu);
|
||||
}
|
||||
|
||||
/*
 * SVM implementation of the ->vcpu_pre_run() hook.
 *
 * SVM has no pre-run condition to check, so this unconditionally returns 1.
 * NOTE(review): 1 presumably tells the common x86 code that the vCPU may
 * proceed to run (the VMX counterpart returns 0 only after preparing an
 * emulation-failure exit) -- confirm against the caller.
 */
static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
	return 1;
}
|
||||
|
||||
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
|
||||
@ -4629,8 +4391,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
|
||||
.prepare_guest_switch = svm_prepare_guest_switch,
|
||||
.vcpu_load = svm_vcpu_load,
|
||||
.vcpu_put = svm_vcpu_put,
|
||||
.vcpu_blocking = svm_vcpu_blocking,
|
||||
.vcpu_unblocking = svm_vcpu_unblocking,
|
||||
.vcpu_blocking = avic_vcpu_blocking,
|
||||
.vcpu_unblocking = avic_vcpu_unblocking,
|
||||
|
||||
.update_exception_bitmap = svm_update_exception_bitmap,
|
||||
.get_msr_feature = svm_get_msr_feature,
|
||||
@ -4662,6 +4424,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
|
||||
.tlb_flush_gva = svm_flush_tlb_gva,
|
||||
.tlb_flush_guest = svm_flush_tlb,
|
||||
|
||||
.vcpu_pre_run = svm_vcpu_pre_run,
|
||||
.run = svm_vcpu_run,
|
||||
.handle_exit = handle_exit,
|
||||
.skip_emulated_instruction = skip_emulated_instruction,
|
||||
@ -4742,6 +4505,243 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
|
||||
.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
|
||||
};
|
||||
|
||||
/*
|
||||
* The default MMIO mask is a single bit (excluding the present bit),
|
||||
* which could conflict with the memory encryption bit. Check for
|
||||
* memory encryption support and override the default MMIO mask if
|
||||
* memory encryption is enabled.
|
||||
*/
|
||||
static __init void svm_adjust_mmio_mask(void)
|
||||
{
|
||||
unsigned int enc_bit, mask_bit;
|
||||
u64 msr, mask;
|
||||
|
||||
/* If there is no memory encryption support, use existing mask */
|
||||
if (cpuid_eax(0x80000000) < 0x8000001f)
|
||||
return;
|
||||
|
||||
/* If memory encryption is not enabled, use existing mask */
|
||||
rdmsrl(MSR_AMD64_SYSCFG, msr);
|
||||
if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
|
||||
return;
|
||||
|
||||
enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
|
||||
mask_bit = boot_cpu_data.x86_phys_bits;
|
||||
|
||||
/* Increment the mask bit if it is the same as the encryption bit */
|
||||
if (enc_bit == mask_bit)
|
||||
mask_bit++;
|
||||
|
||||
/*
|
||||
* If the mask bit location is below 52, then some bits above the
|
||||
* physical addressing limit will always be reserved, so use the
|
||||
* rsvd_bits() function to generate the mask. This mask, along with
|
||||
* the present bit, will be used to generate a page fault with
|
||||
* PFER.RSV = 1.
|
||||
*
|
||||
* If the mask bit location is 52 (or above), then clear the mask.
|
||||
*/
|
||||
mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
|
||||
|
||||
kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
|
||||
}
|
||||
|
||||
static __init void svm_set_cpu_caps(void)
|
||||
{
|
||||
kvm_set_cpu_caps();
|
||||
|
||||
supported_xss = 0;
|
||||
|
||||
/* CPUID 0x80000001 and 0x8000000A (SVM features) */
|
||||
if (nested) {
|
||||
kvm_cpu_cap_set(X86_FEATURE_SVM);
|
||||
|
||||
if (nrips)
|
||||
kvm_cpu_cap_set(X86_FEATURE_NRIPS);
|
||||
|
||||
if (npt_enabled)
|
||||
kvm_cpu_cap_set(X86_FEATURE_NPT);
|
||||
|
||||
if (tsc_scaling)
|
||||
kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
|
||||
|
||||
/* Nested VM can receive #VMEXIT instead of triggering #GP */
|
||||
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
|
||||
}
|
||||
|
||||
/* CPUID 0x80000008 */
|
||||
if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
|
||||
boot_cpu_has(X86_FEATURE_AMD_SSBD))
|
||||
kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
|
||||
|
||||
/* AMD PMU PERFCTR_CORE CPUID */
|
||||
if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
|
||||
kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
|
||||
|
||||
/* CPUID 0x8000001F (SME/SEV features) */
|
||||
sev_set_cpu_caps();
|
||||
}
|
||||
|
||||
static __init int svm_hardware_setup(void)
|
||||
{
|
||||
int cpu;
|
||||
struct page *iopm_pages;
|
||||
void *iopm_va;
|
||||
int r;
|
||||
unsigned int order = get_order(IOPM_SIZE);
|
||||
|
||||
/*
|
||||
* NX is required for shadow paging and for NPT if the NX huge pages
|
||||
* mitigation is enabled.
|
||||
*/
|
||||
if (!boot_cpu_has(X86_FEATURE_NX)) {
|
||||
pr_err_ratelimited("NX (Execute Disable) not supported\n");
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
kvm_enable_efer_bits(EFER_NX);
|
||||
|
||||
iopm_pages = alloc_pages(GFP_KERNEL, order);
|
||||
|
||||
if (!iopm_pages)
|
||||
return -ENOMEM;
|
||||
|
||||
iopm_va = page_address(iopm_pages);
|
||||
memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
|
||||
iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
|
||||
|
||||
init_msrpm_offsets();
|
||||
|
||||
supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
|
||||
|
||||
if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
|
||||
kvm_enable_efer_bits(EFER_FFXSR);
|
||||
|
||||
if (tsc_scaling) {
|
||||
if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
|
||||
tsc_scaling = false;
|
||||
} else {
|
||||
pr_info("TSC scaling supported\n");
|
||||
kvm_has_tsc_control = true;
|
||||
kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
|
||||
kvm_tsc_scaling_ratio_frac_bits = 32;
|
||||
}
|
||||
}
|
||||
|
||||
tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
|
||||
|
||||
/* Check for pause filtering support */
|
||||
if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
|
||||
pause_filter_count = 0;
|
||||
pause_filter_thresh = 0;
|
||||
} else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
|
||||
pause_filter_thresh = 0;
|
||||
}
|
||||
|
||||
if (nested) {
|
||||
printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
|
||||
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
|
||||
}
|
||||
|
||||
/*
|
||||
* KVM's MMU doesn't support using 2-level paging for itself, and thus
|
||||
* NPT isn't supported if the host is using 2-level paging since host
|
||||
* CR4 is unchanged on VMRUN.
|
||||
*/
|
||||
if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
|
||||
npt_enabled = false;
|
||||
|
||||
if (!boot_cpu_has(X86_FEATURE_NPT))
|
||||
npt_enabled = false;
|
||||
|
||||
/* Force VM NPT level equal to the host's paging level */
|
||||
kvm_configure_mmu(npt_enabled, get_npt_level(),
|
||||
get_npt_level(), PG_LEVEL_1G);
|
||||
pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
|
||||
|
||||
/* Note, SEV setup consumes npt_enabled. */
|
||||
sev_hardware_setup();
|
||||
|
||||
svm_hv_hardware_setup();
|
||||
|
||||
svm_adjust_mmio_mask();
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
r = svm_cpu_init(cpu);
|
||||
if (r)
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (nrips) {
|
||||
if (!boot_cpu_has(X86_FEATURE_NRIPS))
|
||||
nrips = false;
|
||||
}
|
||||
|
||||
enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
|
||||
|
||||
if (enable_apicv) {
|
||||
pr_info("AVIC enabled\n");
|
||||
|
||||
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
|
||||
} else {
|
||||
svm_x86_ops.vcpu_blocking = NULL;
|
||||
svm_x86_ops.vcpu_unblocking = NULL;
|
||||
}
|
||||
|
||||
if (vls) {
|
||||
if (!npt_enabled ||
|
||||
!boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
|
||||
!IS_ENABLED(CONFIG_X86_64)) {
|
||||
vls = false;
|
||||
} else {
|
||||
pr_info("Virtual VMLOAD VMSAVE supported\n");
|
||||
}
|
||||
}
|
||||
|
||||
if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
|
||||
svm_gp_erratum_intercept = false;
|
||||
|
||||
if (vgif) {
|
||||
if (!boot_cpu_has(X86_FEATURE_VGIF))
|
||||
vgif = false;
|
||||
else
|
||||
pr_info("Virtual GIF supported\n");
|
||||
}
|
||||
|
||||
if (lbrv) {
|
||||
if (!boot_cpu_has(X86_FEATURE_LBRV))
|
||||
lbrv = false;
|
||||
else
|
||||
pr_info("LBR virtualization supported\n");
|
||||
}
|
||||
|
||||
if (!enable_pmu)
|
||||
pr_info("PMU virtualization is disabled\n");
|
||||
|
||||
svm_set_cpu_caps();
|
||||
|
||||
/*
|
||||
* It seems that on AMD processors PTE's accessed bit is
|
||||
* being set by the CPU hardware before the NPF vmexit.
|
||||
* This is not expected behaviour and our tests fail because
|
||||
* of it.
|
||||
* A workaround here is to disable support for
|
||||
* GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
|
||||
* In this case userspace can know if there is support using
|
||||
* KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
|
||||
* it
|
||||
* If future AMD CPU models change the behaviour described above,
|
||||
* this variable can be changed accordingly
|
||||
*/
|
||||
allow_smaller_maxphyaddr = !npt_enabled;
|
||||
|
||||
return 0;
|
||||
|
||||
err:
|
||||
svm_hardware_teardown();
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
static struct kvm_x86_init_ops svm_init_ops __initdata = {
|
||||
.cpu_has_kvm_support = has_svm,
|
||||
.disabled_by_bios = is_disabled,
|
||||
|
@ -32,7 +32,6 @@
|
||||
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
|
||||
extern bool npt_enabled;
|
||||
extern bool intercept_smi;
|
||||
extern bool pmu;
|
||||
|
||||
/*
|
||||
* Clean bits in VMCB.
|
||||
@ -226,7 +225,6 @@ struct vcpu_svm {
|
||||
u32 dfr_reg;
|
||||
struct page *avic_backing_page;
|
||||
u64 *avic_physical_id_cache;
|
||||
bool avic_is_running;
|
||||
|
||||
/*
|
||||
* Per-vcpu list of struct amd_svm_iommu_ir:
|
||||
@ -574,17 +572,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
|
||||
|
||||
#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
|
||||
|
||||
static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct vcpu_svm *svm = to_svm(vcpu);
|
||||
u64 *entry = svm->avic_physical_id_cache;
|
||||
|
||||
if (!entry)
|
||||
return false;
|
||||
|
||||
return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
|
||||
}
|
||||
|
||||
int avic_ga_log_notifier(u32 ga_tag);
|
||||
void avic_vm_destroy(struct kvm *kvm);
|
||||
int avic_vm_init(struct kvm *kvm);
|
||||
@ -605,8 +592,8 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec);
|
||||
bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu);
|
||||
int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
|
||||
uint32_t guest_irq, bool set);
|
||||
void svm_vcpu_blocking(struct kvm_vcpu *vcpu);
|
||||
void svm_vcpu_unblocking(struct kvm_vcpu *vcpu);
|
||||
void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
|
||||
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
|
||||
|
||||
/* sev.c */
|
||||
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include <asm/vmx.h>
|
||||
|
||||
#include "lapic.h"
|
||||
#include "x86.h"
|
||||
|
||||
extern bool __read_mostly enable_vpid;
|
||||
extern bool __read_mostly flexpriority_enabled;
|
||||
@ -389,6 +390,9 @@ static inline u64 vmx_get_perf_capabilities(void)
|
||||
{
|
||||
u64 perf_cap = 0;
|
||||
|
||||
if (!enable_pmu)
|
||||
return perf_cap;
|
||||
|
||||
if (boot_cpu_has(X86_FEATURE_PDCM))
|
||||
rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);
|
||||
|
||||
|
@ -21,7 +21,6 @@
|
||||
#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
|
||||
|
||||
static struct kvm_event_hw_type_mapping intel_arch_events[] = {
|
||||
/* Index must match CPUID 0x0A.EBX bit vector */
|
||||
[0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
|
||||
[1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
|
||||
[2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
|
||||
@ -29,6 +28,7 @@ static struct kvm_event_hw_type_mapping intel_arch_events[] = {
|
||||
[4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
|
||||
[5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
|
||||
[6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
|
||||
/* The above index must match CPUID 0x0A.EBX bit vector */
|
||||
[7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES },
|
||||
};
|
||||
|
||||
@ -75,11 +75,17 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc)
|
||||
u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++)
|
||||
if (intel_arch_events[i].eventsel == event_select &&
|
||||
intel_arch_events[i].unit_mask == unit_mask &&
|
||||
(pmc_is_fixed(pmc) || pmu->available_event_types & (1 << i)))
|
||||
break;
|
||||
for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) {
|
||||
if (intel_arch_events[i].eventsel != event_select ||
|
||||
intel_arch_events[i].unit_mask != unit_mask)
|
||||
continue;
|
||||
|
||||
/* disable event that reported as not present by cpuid */
|
||||
if ((i < 7) && !(pmu->available_event_types & (1 << i)))
|
||||
return PERF_COUNT_HW_MAX + 1;
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
if (i == ARRAY_SIZE(intel_arch_events))
|
||||
return PERF_COUNT_HW_MAX;
|
||||
@ -481,7 +487,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
|
||||
pmu->reserved_bits = 0xffffffff00200000ull;
|
||||
|
||||
entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
|
||||
if (!entry)
|
||||
if (!entry || !enable_pmu)
|
||||
return;
|
||||
eax.full = entry->eax;
|
||||
edx.full = entry->edx;
|
||||
|
@ -19,7 +19,7 @@
|
||||
* wake the target vCPUs. vCPUs are removed from the list and the notification
|
||||
* vector is reset when the vCPU is scheduled in.
|
||||
*/
|
||||
static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
|
||||
static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu);
|
||||
/*
|
||||
* Protect the per-CPU list with a per-CPU spinlock to handle task migration.
|
||||
* When a blocking vCPU is awakened _and_ migrated to a different pCPU, the
|
||||
@ -27,7 +27,7 @@ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
|
||||
* CPU. IRQs must be disabled when taking this lock, otherwise deadlock will
|
||||
* occur if a wakeup IRQ arrives and attempts to acquire the lock.
|
||||
*/
|
||||
static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
|
||||
static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);
|
||||
|
||||
static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
@ -51,7 +51,9 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new)
|
||||
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
|
||||
{
|
||||
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
|
||||
struct vcpu_vmx *vmx = to_vmx(vcpu);
|
||||
struct pi_desc old, new;
|
||||
unsigned long flags;
|
||||
unsigned int dest;
|
||||
|
||||
/*
|
||||
@ -62,23 +64,34 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
|
||||
if (!enable_apicv || !lapic_in_kernel(vcpu))
|
||||
return;
|
||||
|
||||
/* Nothing to do if PI.SN and PI.NDST both have the desired value. */
|
||||
if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
|
||||
return;
|
||||
|
||||
/*
|
||||
* If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
|
||||
* PI.NDST: pi_post_block is the one expected to change PID.NDST and the
|
||||
* wakeup handler expects the vCPU to be on the blocked_vcpu_list that
|
||||
* matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
|
||||
* correctly.
|
||||
* If the vCPU wasn't on the wakeup list and wasn't migrated, then the
|
||||
* full update can be skipped as neither the vector nor the destination
|
||||
* needs to be changed.
|
||||
*/
|
||||
if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
|
||||
pi_clear_sn(pi_desc);
|
||||
goto after_clear_sn;
|
||||
if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
|
||||
/*
|
||||
* Clear SN if it was set due to being preempted. Again, do
|
||||
* this even if there is no assigned device for simplicity.
|
||||
*/
|
||||
if (pi_test_and_clear_sn(pi_desc))
|
||||
goto after_clear_sn;
|
||||
return;
|
||||
}
|
||||
|
||||
local_irq_save(flags);
|
||||
|
||||
/*
|
||||
* If the vCPU was waiting for wakeup, remove the vCPU from the wakeup
|
||||
* list of the _previous_ pCPU, which will not be the same as the
|
||||
* current pCPU if the task was migrated.
|
||||
*/
|
||||
if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
|
||||
raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
|
||||
list_del(&vmx->pi_wakeup_list);
|
||||
raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
|
||||
}
|
||||
|
||||
/* The full case. Set the new destination and clear SN. */
|
||||
dest = cpu_physical_id(cpu);
|
||||
if (!x2apic_mode)
|
||||
dest = (dest << 8) & 0xFF00;
|
||||
@ -86,10 +99,22 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
|
||||
do {
|
||||
old.control = new.control = READ_ONCE(pi_desc->control);
|
||||
|
||||
/*
|
||||
* Clear SN (as above) and refresh the destination APIC ID to
|
||||
* handle task migration (@cpu != vcpu->cpu).
|
||||
*/
|
||||
new.ndst = dest;
|
||||
new.sn = 0;
|
||||
|
||||
/*
|
||||
* Restore the notification vector; in the blocking case, the
|
||||
* descriptor was modified on "put" to use the wakeup vector.
|
||||
*/
|
||||
new.nv = POSTED_INTR_VECTOR;
|
||||
} while (pi_try_set_control(pi_desc, old.control, new.control));
|
||||
|
||||
local_irq_restore(flags);
|
||||
|
||||
after_clear_sn:
|
||||
|
||||
/*
|
||||
@ -111,83 +136,25 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm)
|
||||
irq_remapping_cap(IRQ_POSTING_CAP);
|
||||
}
|
||||
|
||||
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
|
||||
|
||||
if (!vmx_can_use_vtd_pi(vcpu->kvm))
|
||||
return;
|
||||
|
||||
/* Set SN when the vCPU is preempted */
|
||||
if (vcpu->preempted)
|
||||
pi_set_sn(pi_desc);
|
||||
}
|
||||
|
||||
static void __pi_post_block(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
|
||||
struct pi_desc old, new;
|
||||
unsigned int dest;
|
||||
|
||||
/*
|
||||
* Remove the vCPU from the wakeup list of the _previous_ pCPU, which
|
||||
* will not be the same as the current pCPU if the task was migrated.
|
||||
*/
|
||||
spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
|
||||
list_del(&vcpu->blocked_vcpu_list);
|
||||
spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
|
||||
|
||||
dest = cpu_physical_id(vcpu->cpu);
|
||||
if (!x2apic_mode)
|
||||
dest = (dest << 8) & 0xFF00;
|
||||
|
||||
WARN(pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR,
|
||||
"Wakeup handler not enabled while the vCPU was blocking");
|
||||
|
||||
do {
|
||||
old.control = new.control = READ_ONCE(pi_desc->control);
|
||||
|
||||
new.ndst = dest;
|
||||
|
||||
/* set 'NV' to 'notification vector' */
|
||||
new.nv = POSTED_INTR_VECTOR;
|
||||
} while (pi_try_set_control(pi_desc, old.control, new.control));
|
||||
|
||||
vcpu->pre_pcpu = -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* This routine does the following things for vCPU which is going
|
||||
* to be blocked if VT-d PI is enabled.
|
||||
* - Store the vCPU to the wakeup list, so when interrupts happen
|
||||
* we can find the right vCPU to wake up.
|
||||
* - Change the Posted-interrupt descriptor as below:
|
||||
* 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
|
||||
* - If 'ON' is set during this process, which means at least one
|
||||
* interrupt is posted for this vCPU, we cannot block it, in
|
||||
* this case, return 1, otherwise, return 0.
|
||||
*
|
||||
* Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set
|
||||
* WAKEUP as the notification vector in the PI descriptor.
|
||||
*/
|
||||
int pi_pre_block(struct kvm_vcpu *vcpu)
|
||||
static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct pi_desc old, new;
|
||||
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
|
||||
struct vcpu_vmx *vmx = to_vmx(vcpu);
|
||||
struct pi_desc old, new;
|
||||
unsigned long flags;
|
||||
|
||||
if (!vmx_can_use_vtd_pi(vcpu->kvm) ||
|
||||
vmx_interrupt_blocked(vcpu))
|
||||
return 0;
|
||||
|
||||
local_irq_save(flags);
|
||||
|
||||
vcpu->pre_pcpu = vcpu->cpu;
|
||||
spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
|
||||
list_add_tail(&vcpu->blocked_vcpu_list,
|
||||
&per_cpu(blocked_vcpu_on_cpu, vcpu->cpu));
|
||||
spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
|
||||
raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
|
||||
list_add_tail(&vmx->pi_wakeup_list,
|
||||
&per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
|
||||
raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
|
||||
|
||||
WARN(pi_desc->sn == 1,
|
||||
"Posted Interrupt Suppress Notification set before blocking");
|
||||
WARN(pi_desc->sn, "PI descriptor SN field set before blocking");
|
||||
|
||||
do {
|
||||
old.control = new.control = READ_ONCE(pi_desc->control);
|
||||
@ -196,24 +163,37 @@ int pi_pre_block(struct kvm_vcpu *vcpu)
|
||||
new.nv = POSTED_INTR_WAKEUP_VECTOR;
|
||||
} while (pi_try_set_control(pi_desc, old.control, new.control));
|
||||
|
||||
/* We should not block the vCPU if an interrupt is posted for it. */
|
||||
if (pi_test_on(pi_desc))
|
||||
__pi_post_block(vcpu);
|
||||
/*
|
||||
* Send a wakeup IPI to this CPU if an interrupt may have been posted
|
||||
* before the notification vector was updated, in which case the IRQ
|
||||
* will arrive on the non-wakeup vector. An IPI is needed as calling
|
||||
* try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not
|
||||
* enabled until it is safe to call try_to_wake_up() on the task being
|
||||
* scheduled out).
|
||||
*/
|
||||
if (pi_test_on(&new))
|
||||
apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
|
||||
|
||||
local_irq_restore(flags);
|
||||
return (vcpu->pre_pcpu == -1);
|
||||
}
|
||||
|
||||
void pi_post_block(struct kvm_vcpu *vcpu)
|
||||
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
unsigned long flags;
|
||||
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
|
||||
|
||||
if (vcpu->pre_pcpu == -1)
|
||||
if (!vmx_can_use_vtd_pi(vcpu->kvm))
|
||||
return;
|
||||
|
||||
local_irq_save(flags);
|
||||
__pi_post_block(vcpu);
|
||||
local_irq_restore(flags);
|
||||
if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu))
|
||||
pi_enable_wakeup_handler(vcpu);
|
||||
|
||||
/*
|
||||
* Set SN when the vCPU is preempted. Note, the vCPU can both be seen
|
||||
* as blocking and preempted, e.g. if it's preempted between setting
|
||||
* its wait state and manually scheduling out.
|
||||
*/
|
||||
if (vcpu->preempted)
|
||||
pi_set_sn(pi_desc);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -221,24 +201,23 @@ void pi_post_block(struct kvm_vcpu *vcpu)
|
||||
*/
|
||||
void pi_wakeup_handler(void)
|
||||
{
|
||||
struct kvm_vcpu *vcpu;
|
||||
int cpu = smp_processor_id();
|
||||
struct vcpu_vmx *vmx;
|
||||
|
||||
spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
|
||||
list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
|
||||
blocked_vcpu_list) {
|
||||
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
|
||||
raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
|
||||
list_for_each_entry(vmx, &per_cpu(wakeup_vcpus_on_cpu, cpu),
|
||||
pi_wakeup_list) {
|
||||
|
||||
if (pi_test_on(pi_desc))
|
||||
kvm_vcpu_kick(vcpu);
|
||||
if (pi_test_on(&vmx->pi_desc))
|
||||
kvm_vcpu_wake_up(&vmx->vcpu);
|
||||
}
|
||||
spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
|
||||
raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
|
||||
}
|
||||
|
||||
void __init pi_init_cpu(int cpu)
|
||||
{
|
||||
INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
|
||||
spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
|
||||
INIT_LIST_HEAD(&per_cpu(wakeup_vcpus_on_cpu, cpu));
|
||||
raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
|
||||
}
|
||||
|
||||
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
|
||||
@ -254,7 +233,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
|
||||
* Bail out of the block loop if the VM has an assigned
|
||||
* device, but the blocking vCPU didn't reconfigure the
|
||||
* PI.NV to the wakeup vector, i.e. the assigned device
|
||||
* came along after the initial check in pi_pre_block().
|
||||
* came along after the initial check in vmx_vcpu_pi_put().
|
||||
*/
|
||||
void vmx_pi_start_assignment(struct kvm *kvm)
|
||||
{
|
||||
|
@ -40,6 +40,12 @@ static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
|
||||
(unsigned long *)&pi_desc->control);
|
||||
}
|
||||
|
||||
static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
|
||||
{
|
||||
return test_and_clear_bit(POSTED_INTR_SN,
|
||||
(unsigned long *)&pi_desc->control);
|
||||
}
|
||||
|
||||
static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
|
||||
{
|
||||
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
|
||||
@ -88,8 +94,6 @@ static inline bool pi_test_sn(struct pi_desc *pi_desc)
|
||||
|
||||
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
|
||||
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
|
||||
int pi_pre_block(struct kvm_vcpu *vcpu);
|
||||
void pi_post_block(struct kvm_vcpu *vcpu);
|
||||
void pi_wakeup_handler(void);
|
||||
void __init pi_init_cpu(int cpu);
|
||||
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
|
||||
|
@ -3931,12 +3931,10 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
|
||||
pt_update_intercept_for_msr(vcpu);
|
||||
}
|
||||
|
||||
static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
|
||||
bool nested)
|
||||
static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
|
||||
int pi_vec)
|
||||
{
|
||||
#ifdef CONFIG_SMP
|
||||
int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
|
||||
|
||||
if (vcpu->mode == IN_GUEST_MODE) {
|
||||
/*
|
||||
* The vector of interrupt to be delivered to vcpu had
|
||||
@ -3964,10 +3962,15 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
|
||||
*/
|
||||
|
||||
apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
|
||||
return true;
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
/*
|
||||
* The vCPU isn't in the guest; wake the vCPU in case it is blocking,
|
||||
* otherwise do nothing as KVM will grab the highest priority pending
|
||||
* IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
|
||||
*/
|
||||
kvm_vcpu_wake_up(vcpu);
|
||||
}
|
||||
|
||||
static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
|
||||
@ -3997,8 +4000,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
|
||||
smp_mb__after_atomic();
|
||||
|
||||
/* the PIR and ON have been set by L1. */
|
||||
if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
|
||||
kvm_vcpu_kick(vcpu);
|
||||
kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
|
||||
return 0;
|
||||
}
|
||||
return -1;
|
||||
@ -4035,9 +4037,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
|
||||
* guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
|
||||
* posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
|
||||
*/
|
||||
if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
|
||||
kvm_vcpu_kick(vcpu);
|
||||
|
||||
kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -5426,6 +5426,14 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
|
||||
return 1;
|
||||
}
|
||||
|
||||
static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct vcpu_vmx *vmx = to_vmx(vcpu);
|
||||
|
||||
return vmx->emulation_required && !vmx->rmode.vm86_active &&
|
||||
vcpu->arch.exception.pending;
|
||||
}
|
||||
|
||||
static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct vcpu_vmx *vmx = to_vmx(vcpu);
|
||||
@ -5445,8 +5453,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
|
||||
if (!kvm_emulate_instruction(vcpu, 0))
|
||||
return 0;
|
||||
|
||||
if (vmx->emulation_required && !vmx->rmode.vm86_active &&
|
||||
vcpu->arch.exception.pending) {
|
||||
if (vmx_emulation_required_with_pending_exception(vcpu)) {
|
||||
kvm_prepare_emulation_failure_exit(vcpu);
|
||||
return 0;
|
||||
}
|
||||
@ -5468,6 +5475,16 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
 * .vcpu_pre_run hook for VMX, invoked right before the vCPU run loop.
 *
 * Returns 1 when the vCPU may be run.  Returns 0, after filling in an
 * emulation-failure exit, when an exception is pending while guest state
 * still requires emulation; the caller skips the run loop for any return
 * value <= 0.
 */
static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
	if (!vmx_emulation_required_with_pending_exception(vcpu))
		return 1;

	kvm_prepare_emulation_failure_exit(vcpu);
	return 0;
}
|
||||
|
||||
static void grow_ple_window(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
struct vcpu_vmx *vmx = to_vmx(vcpu);
|
||||
@ -6928,6 +6945,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
|
||||
BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
|
||||
vmx = to_vmx(vcpu);
|
||||
|
||||
INIT_LIST_HEAD(&vmx->pi_wakeup_list);
|
||||
|
||||
err = -ENOMEM;
|
||||
|
||||
vmx->vpid = allocate_vpid();
|
||||
@ -7549,25 +7568,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
|
||||
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
|
||||
}
|
||||
|
||||
static int vmx_pre_block(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (pi_pre_block(vcpu))
|
||||
return 1;
|
||||
|
||||
if (kvm_lapic_hv_timer_in_use(vcpu))
|
||||
kvm_lapic_switch_to_sw_timer(vcpu);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void vmx_post_block(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (kvm_x86_ops.set_hv_timer)
|
||||
kvm_lapic_switch_to_hv_timer(vcpu);
|
||||
|
||||
pi_post_block(vcpu);
|
||||
}
|
||||
|
||||
static void vmx_setup_mce(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (vcpu->arch.mcg_cap & MCG_LMCE_P)
|
||||
@ -7710,6 +7710,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
|
||||
.tlb_flush_gva = vmx_flush_tlb_gva,
|
||||
.tlb_flush_guest = vmx_flush_tlb_guest,
|
||||
|
||||
.vcpu_pre_run = vmx_vcpu_pre_run,
|
||||
.run = vmx_vcpu_run,
|
||||
.handle_exit = vmx_handle_exit,
|
||||
.skip_emulated_instruction = vmx_skip_emulated_instruction,
|
||||
@ -7768,9 +7769,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
|
||||
.cpu_dirty_log_size = PML_ENTITY_NUM,
|
||||
.update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
|
||||
|
||||
.pre_block = vmx_pre_block,
|
||||
.post_block = vmx_post_block,
|
||||
|
||||
.pmu_ops = &intel_pmu_ops,
|
||||
.nested_ops = &vmx_nested_ops,
|
||||
|
||||
|
@ -317,6 +317,9 @@ struct vcpu_vmx {
|
||||
/* Posted interrupt descriptor */
|
||||
struct pi_desc pi_desc;
|
||||
|
||||
/* Used if this vCPU is waiting for PI notification wakeup. */
|
||||
struct list_head pi_wakeup_list;
|
||||
|
||||
/* Support for a guest hypervisor (nested VMX) */
|
||||
struct nested_vmx nested;
|
||||
|
||||
|
@ -187,6 +187,11 @@ module_param(force_emulation_prefix, bool, S_IRUGO);
|
||||
int __read_mostly pi_inject_timer = -1;
|
||||
module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
|
||||
|
||||
/* Enable/disable PMU virtualization */
|
||||
bool __read_mostly enable_pmu = true;
|
||||
EXPORT_SYMBOL_GPL(enable_pmu);
|
||||
module_param(enable_pmu, bool, 0444);
|
||||
|
||||
/*
|
||||
* Restoring the host value for MSRs that are only consumed when running in
|
||||
* usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
|
||||
@ -5230,17 +5235,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
|
||||
struct kvm_cpuid __user *cpuid_arg = argp;
|
||||
struct kvm_cpuid cpuid;
|
||||
|
||||
/*
|
||||
* KVM does not correctly handle changing guest CPUID after KVM_RUN, as
|
||||
* MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
|
||||
* tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
|
||||
* faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
|
||||
* the core vCPU model on the fly, so fail.
|
||||
*/
|
||||
r = -EINVAL;
|
||||
if (vcpu->arch.last_vmentry_cpu != -1)
|
||||
goto out;
|
||||
|
||||
r = -EFAULT;
|
||||
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
|
||||
goto out;
|
||||
@ -5251,14 +5245,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
|
||||
struct kvm_cpuid2 __user *cpuid_arg = argp;
|
||||
struct kvm_cpuid2 cpuid;
|
||||
|
||||
/*
|
||||
* KVM_SET_CPUID{,2} after KVM_RUN is forbidden, see the comment in
|
||||
* KVM_SET_CPUID case above.
|
||||
*/
|
||||
r = -EINVAL;
|
||||
if (vcpu->arch.last_vmentry_cpu != -1)
|
||||
goto out;
|
||||
|
||||
r = -EFAULT;
|
||||
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
|
||||
goto out;
|
||||
@ -9945,10 +9931,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
|
||||
smp_mb__after_srcu_read_unlock();
|
||||
|
||||
/*
|
||||
* This handles the case where a posted interrupt was
|
||||
* notified with kvm_vcpu_kick. Assigned devices can
|
||||
* use the POSTED_INTR_VECTOR even if APICv is disabled,
|
||||
* so do it even if APICv is disabled on this vCPU.
|
||||
* Process pending posted interrupts to handle the case where the
|
||||
* notification IRQ arrived in the host, or was never sent (because the
|
||||
* target vCPU wasn't running). Do this regardless of the vCPU's APICv
|
||||
* status, KVM doesn't update assigned devices when APICv is inhibited,
|
||||
* i.e. they can post interrupts even if APICv is temporarily disabled.
|
||||
*/
|
||||
if (kvm_lapic_enabled(vcpu))
|
||||
static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
|
||||
@ -10113,8 +10100,20 @@ out:
|
||||
|
||||
static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
|
||||
{
|
||||
if (!kvm_arch_vcpu_runnable(vcpu) &&
|
||||
(!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) {
|
||||
bool hv_timer;
|
||||
|
||||
if (!kvm_arch_vcpu_runnable(vcpu)) {
|
||||
/*
|
||||
* Switch to the software timer before halt-polling/blocking as
|
||||
* the guest's timer may be a break event for the vCPU, and the
|
||||
* hypervisor timer runs only when the CPU is in guest mode.
|
||||
* Switch before halt-polling so that KVM recognizes an expired
|
||||
* timer before blocking.
|
||||
*/
|
||||
hv_timer = kvm_lapic_hv_timer_in_use(vcpu);
|
||||
if (hv_timer)
|
||||
kvm_lapic_switch_to_sw_timer(vcpu);
|
||||
|
||||
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
|
||||
if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
|
||||
kvm_vcpu_halt(vcpu);
|
||||
@ -10122,8 +10121,8 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
|
||||
kvm_vcpu_block(vcpu);
|
||||
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
|
||||
|
||||
if (kvm_x86_ops.post_block)
|
||||
static_call(kvm_x86_post_block)(vcpu);
|
||||
if (hv_timer)
|
||||
kvm_lapic_switch_to_hv_timer(vcpu);
|
||||
|
||||
if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
|
||||
return 1;
|
||||
@ -10316,6 +10315,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
|
||||
r = -EINTR;
|
||||
goto out;
|
||||
}
|
||||
/*
|
||||
* It should be impossible for the hypervisor timer to be in
|
||||
* use before KVM has ever run the vCPU.
|
||||
*/
|
||||
WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu));
|
||||
kvm_vcpu_block(vcpu);
|
||||
if (kvm_apic_accept_events(vcpu) < 0) {
|
||||
r = 0;
|
||||
@ -10360,10 +10364,16 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
|
||||
} else
|
||||
WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
|
||||
|
||||
if (kvm_run->immediate_exit)
|
||||
if (kvm_run->immediate_exit) {
|
||||
r = -EINTR;
|
||||
else
|
||||
r = vcpu_run(vcpu);
|
||||
goto out;
|
||||
}
|
||||
|
||||
r = static_call(kvm_x86_vcpu_pre_run)(vcpu);
|
||||
if (r <= 0)
|
||||
goto out;
|
||||
|
||||
r = vcpu_run(vcpu);
|
||||
|
||||
out:
|
||||
kvm_put_guest_fpu(vcpu);
|
||||
|
@ -336,6 +336,7 @@ extern u64 host_xcr0;
|
||||
extern u64 supported_xcr0;
|
||||
extern u64 host_xss;
|
||||
extern u64 supported_xss;
|
||||
extern bool enable_pmu;
|
||||
|
||||
static inline bool kvm_mpx_supported(void)
|
||||
{
|
||||
|
@ -309,9 +309,6 @@ struct kvm_vcpu {
|
||||
u64 requests;
|
||||
unsigned long guest_debug;
|
||||
|
||||
int pre_pcpu;
|
||||
struct list_head blocked_vcpu_list;
|
||||
|
||||
struct mutex mutex;
|
||||
struct kvm_run *run;
|
||||
|
||||
|
@ -1131,7 +1131,8 @@ struct kvm_ppc_resize_hpt {
|
||||
#define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204
|
||||
#define KVM_CAP_ARM_MTE 205
|
||||
#define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206
|
||||
#define KVM_CAP_XSAVE2 207
|
||||
#define KVM_CAP_VM_GPA_BITS 207
|
||||
#define KVM_CAP_XSAVE2 208
|
||||
|
||||
#ifdef KVM_CAP_IRQ_ROUTING
|
||||
|
||||
@ -1163,11 +1164,20 @@ struct kvm_irq_routing_hv_sint {
|
||||
__u32 sint;
|
||||
};
|
||||
|
||||
struct kvm_irq_routing_xen_evtchn {
|
||||
__u32 port;
|
||||
__u32 vcpu;
|
||||
__u32 priority;
|
||||
};
|
||||
|
||||
#define KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL ((__u32)(-1))
|
||||
|
||||
/* gsi routing entry types */
|
||||
#define KVM_IRQ_ROUTING_IRQCHIP 1
|
||||
#define KVM_IRQ_ROUTING_MSI 2
|
||||
#define KVM_IRQ_ROUTING_S390_ADAPTER 3
|
||||
#define KVM_IRQ_ROUTING_HV_SINT 4
|
||||
#define KVM_IRQ_ROUTING_XEN_EVTCHN 5
|
||||
|
||||
struct kvm_irq_routing_entry {
|
||||
__u32 gsi;
|
||||
@ -1179,6 +1189,7 @@ struct kvm_irq_routing_entry {
|
||||
struct kvm_irq_routing_msi msi;
|
||||
struct kvm_irq_routing_s390_adapter adapter;
|
||||
struct kvm_irq_routing_hv_sint hv_sint;
|
||||
struct kvm_irq_routing_xen_evtchn xen_evtchn;
|
||||
__u32 pad[8];
|
||||
} u;
|
||||
};
|
||||
@ -1209,6 +1220,7 @@ struct kvm_x86_mce {
|
||||
#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1)
|
||||
#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2)
|
||||
#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3)
|
||||
#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4)
|
||||
|
||||
struct kvm_xen_hvm_config {
|
||||
__u32 flags;
|
||||
@ -1552,8 +1564,6 @@ struct kvm_s390_ucas_mapping {
|
||||
/* Available with KVM_CAP_XSAVE */
|
||||
#define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave)
|
||||
#define KVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave)
|
||||
/* Available with KVM_CAP_XSAVE2 */
|
||||
#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave)
|
||||
/* Available with KVM_CAP_XCRS */
|
||||
#define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs)
|
||||
#define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs)
|
||||
@ -1613,6 +1623,9 @@ struct kvm_enc_region {
|
||||
#define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3)
|
||||
#define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4)
|
||||
|
||||
/* Available with KVM_CAP_XSAVE2 */
|
||||
#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave)
|
||||
|
||||
struct kvm_s390_pv_sec_parm {
|
||||
__u64 origin;
|
||||
__u64 length;
|
||||
|
5
tools/testing/selftests/kvm/.gitignore
vendored
5
tools/testing/selftests/kvm/.gitignore
vendored
@ -8,11 +8,12 @@
|
||||
/s390x/memop
|
||||
/s390x/resets
|
||||
/s390x/sync_regs_test
|
||||
/x86_64/amx_test
|
||||
/x86_64/cpuid_test
|
||||
/x86_64/cr4_cpuid_sync_test
|
||||
/x86_64/debug_regs
|
||||
/x86_64/evmcs_test
|
||||
/x86_64/emulator_error_test
|
||||
/x86_64/get_cpuid_test
|
||||
/x86_64/get_msr_index_features
|
||||
/x86_64/kvm_clock_test
|
||||
/x86_64/kvm_pv_test
|
||||
@ -22,6 +23,7 @@
|
||||
/x86_64/mmio_warning_test
|
||||
/x86_64/mmu_role_test
|
||||
/x86_64/platform_info_test
|
||||
/x86_64/pmu_event_filter_test
|
||||
/x86_64/set_boot_cpu_id
|
||||
/x86_64/set_sregs_test
|
||||
/x86_64/sev_migrate_tests
|
||||
@ -36,6 +38,7 @@
|
||||
/x86_64/vmx_apic_access_test
|
||||
/x86_64/vmx_close_while_nested_test
|
||||
/x86_64/vmx_dirty_log_test
|
||||
/x86_64/vmx_exception_with_invalid_guest_state
|
||||
/x86_64/vmx_invalid_nested_guest_state
|
||||
/x86_64/vmx_preemption_timer_test
|
||||
/x86_64/vmx_set_nested_state_test
|
||||
|
@ -43,11 +43,11 @@ LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c lib/aarch64/handler
|
||||
LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c
|
||||
LIBKVM_riscv = lib/riscv/processor.c lib/riscv/ucall.c
|
||||
|
||||
TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test
|
||||
TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test
|
||||
TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test
|
||||
TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
|
||||
TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
|
||||
TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test
|
||||
TEST_GEN_PROGS_x86_64 += x86_64/get_cpuid_test
|
||||
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock
|
||||
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
|
||||
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features
|
||||
@ -56,6 +56,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
|
||||
TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
|
||||
TEST_GEN_PROGS_x86_64 += x86_64/mmu_role_test
|
||||
TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
|
||||
TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test
|
||||
TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id
|
||||
TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
|
||||
TEST_GEN_PROGS_x86_64 += x86_64/smm_test
|
||||
@ -69,6 +70,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test
|
||||
TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test
|
||||
TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
|
||||
TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
|
||||
TEST_GEN_PROGS_x86_64 += x86_64/vmx_exception_with_invalid_guest_state
|
||||
TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state
|
||||
TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
|
||||
TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
|
||||
|
@ -364,6 +364,24 @@ static inline unsigned long get_xmm(int n)
|
||||
}
|
||||
|
||||
bool is_intel_cpu(void);
|
||||
bool is_amd_cpu(void);
|
||||
|
||||
/*
 * Decode the CPU family from a CPUID signature (callers pass the EAX output
 * of CPUID leaf 1).
 *
 * Bits 11:8 hold the base family.  Only when the base family is 0xf are
 * bits 27:20 (the extended family) added on top of it.
 */
static inline unsigned int x86_family(unsigned int eax)
{
	unsigned int base_family = (eax >> 8) & 0xf;
	unsigned int ext_family = (eax >> 20) & 0xff;

	return base_family == 0xf ? base_family + ext_family : base_family;
}
|
||||
|
||||
/*
 * Decode the CPU model from a CPUID signature (same EAX layout as
 * x86_family()).
 *
 * The result is bits 19:16 (extended model) as the high nibble and
 * bits 7:4 (base model) as the low nibble.  The extended model is folded in
 * unconditionally, regardless of the family.
 */
static inline unsigned int x86_model(unsigned int eax)
{
	unsigned int ext_model = (eax >> 16) & 0xf;
	unsigned int base_model = (eax >> 4) & 0xf;

	return (ext_model << 4) | base_model;
}
|
||||
|
||||
struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid);
|
||||
void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid,
|
||||
@ -375,6 +393,8 @@ uint64_t kvm_get_feature_msr(uint64_t msr_index);
|
||||
struct kvm_cpuid2 *kvm_get_supported_cpuid(void);
|
||||
|
||||
struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid);
|
||||
int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
|
||||
struct kvm_cpuid2 *cpuid);
|
||||
void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
|
||||
struct kvm_cpuid2 *cpuid);
|
||||
|
||||
@ -418,6 +438,11 @@ uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr);
|
||||
void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr,
|
||||
uint64_t pte);
|
||||
|
||||
/*
|
||||
* get_cpuid() - find matching CPUID entry and return pointer to it.
|
||||
*/
|
||||
struct kvm_cpuid_entry2 *get_cpuid(struct kvm_cpuid2 *cpuid, uint32_t function,
|
||||
uint32_t index);
|
||||
/*
|
||||
* set_cpuid() - overwrites a matching cpuid entry with the provided value.
|
||||
* matches based on ent->function && ent->index. returns true
|
||||
|
@ -393,10 +393,12 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
|
||||
struct kvm_vm *vm;
|
||||
int i;
|
||||
|
||||
#ifdef __x86_64__
|
||||
/*
|
||||
* Permission needs to be requested before KVM_SET_CPUID2.
|
||||
*/
|
||||
vm_xsave_req_perm();
|
||||
#endif
|
||||
|
||||
/* Force slot0 memory size to be no smaller than DEFAULT_GUEST_PHY_PAGES */
|
||||
if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES)
|
||||
@ -497,9 +499,11 @@ void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log)
|
||||
void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log,
|
||||
uint64_t first_page, uint32_t num_pages)
|
||||
{
|
||||
struct kvm_clear_dirty_log args = { .dirty_bitmap = log, .slot = slot,
|
||||
.first_page = first_page,
|
||||
.num_pages = num_pages };
|
||||
struct kvm_clear_dirty_log args = {
|
||||
.dirty_bitmap = log, .slot = slot,
|
||||
.first_page = first_page,
|
||||
.num_pages = num_pages
|
||||
};
|
||||
int ret;
|
||||
|
||||
ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args);
|
||||
|
@ -886,6 +886,17 @@ kvm_get_supported_cpuid_index(uint32_t function, uint32_t index)
|
||||
return entry;
|
||||
}
|
||||
|
||||
|
||||
int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
|
||||
struct kvm_cpuid2 *cpuid)
|
||||
{
|
||||
struct vcpu *vcpu = vcpu_find(vm, vcpuid);
|
||||
|
||||
TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
|
||||
|
||||
return ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid);
|
||||
}
|
||||
|
||||
/*
|
||||
* VM VCPU CPUID Set
|
||||
*
|
||||
@ -903,12 +914,9 @@ kvm_get_supported_cpuid_index(uint32_t function, uint32_t index)
|
||||
void vcpu_set_cpuid(struct kvm_vm *vm,
|
||||
uint32_t vcpuid, struct kvm_cpuid2 *cpuid)
|
||||
{
|
||||
struct vcpu *vcpu = vcpu_find(vm, vcpuid);
|
||||
int rc;
|
||||
|
||||
TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
|
||||
|
||||
rc = ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid);
|
||||
rc = __vcpu_set_cpuid(vm, vcpuid, cpuid);
|
||||
TEST_ASSERT(rc == 0, "KVM_SET_CPUID2 failed, rc: %i errno: %i",
|
||||
rc, errno);
|
||||
|
||||
@ -1136,25 +1144,25 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
|
||||
list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0]));
|
||||
list->nmsrs = nmsrs;
|
||||
r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i",
|
||||
r);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i",
|
||||
r);
|
||||
|
||||
state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0]));
|
||||
r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i",
|
||||
r);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i",
|
||||
r);
|
||||
|
||||
r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i",
|
||||
r);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i",
|
||||
r);
|
||||
|
||||
r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i",
|
||||
r);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i",
|
||||
r);
|
||||
|
||||
r = vcpu_save_xsave_state(vm, vcpu, state);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i",
|
||||
r);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i",
|
||||
r);
|
||||
|
||||
if (kvm_check_cap(KVM_CAP_XCRS)) {
|
||||
r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs);
|
||||
@ -1163,17 +1171,17 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
|
||||
}
|
||||
|
||||
r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i",
|
||||
r);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i",
|
||||
r);
|
||||
|
||||
if (nested_size) {
|
||||
state->nested.size = sizeof(state->nested_);
|
||||
r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i",
|
||||
r);
|
||||
r);
|
||||
TEST_ASSERT(state->nested.size <= nested_size,
|
||||
"Nested state size too big, %i (KVM_CHECK_CAP gave %i)",
|
||||
state->nested.size, nested_size);
|
||||
"Nested state size too big, %i (KVM_CHECK_CAP gave %i)",
|
||||
state->nested.size, nested_size);
|
||||
} else
|
||||
state->nested.size = 0;
|
||||
|
||||
@ -1181,12 +1189,12 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
|
||||
for (i = 0; i < nmsrs; i++)
|
||||
state->msrs.entries[i].index = list->indices[i];
|
||||
r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs);
|
||||
TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)",
|
||||
r, r == nmsrs ? -1 : list->indices[r]);
|
||||
TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)",
|
||||
r, r == nmsrs ? -1 : list->indices[r]);
|
||||
|
||||
r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i",
|
||||
r);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i",
|
||||
r);
|
||||
|
||||
free(list);
|
||||
return state;
|
||||
@ -1199,7 +1207,7 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s
|
||||
|
||||
r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i",
|
||||
r);
|
||||
r);
|
||||
|
||||
r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs);
|
||||
TEST_ASSERT(r == state->msrs.nmsrs,
|
||||
@ -1214,28 +1222,28 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s
|
||||
|
||||
r = ioctl(vcpu->fd, KVM_SET_XSAVE, state->xsave);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i",
|
||||
r);
|
||||
r);
|
||||
|
||||
r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i",
|
||||
r);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i",
|
||||
r);
|
||||
|
||||
r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i",
|
||||
r);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i",
|
||||
r);
|
||||
|
||||
r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i",
|
||||
r);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i",
|
||||
r);
|
||||
|
||||
r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i",
|
||||
r);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i",
|
||||
r);
|
||||
|
||||
if (state->nested.size) {
|
||||
r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested);
|
||||
TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: %i",
|
||||
r);
|
||||
r);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1245,10 +1253,10 @@ void kvm_x86_state_cleanup(struct kvm_x86_state *state)
|
||||
free(state);
|
||||
}
|
||||
|
||||
bool is_intel_cpu(void)
|
||||
static bool cpu_vendor_string_is(const char *vendor)
|
||||
{
|
||||
const uint32_t *chunk = (const uint32_t *)vendor;
|
||||
int eax, ebx, ecx, edx;
|
||||
const uint32_t *chunk;
|
||||
const int leaf = 0;
|
||||
|
||||
__asm__ __volatile__(
|
||||
@ -1257,10 +1265,22 @@ bool is_intel_cpu(void)
|
||||
"=c"(ecx), "=d"(edx)
|
||||
: /* input */ "0"(leaf), "2"(0));
|
||||
|
||||
chunk = (const uint32_t *)("GenuineIntel");
|
||||
return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]);
|
||||
}
|
||||
|
||||
bool is_intel_cpu(void)
|
||||
{
|
||||
return cpu_vendor_string_is("GenuineIntel");
|
||||
}
|
||||
|
||||
/*
|
||||
* Exclude early K5 samples with a vendor string of "AMDisbetter!"
|
||||
*/
|
||||
bool is_amd_cpu(void)
|
||||
{
|
||||
return cpu_vendor_string_is("AuthenticAMD");
|
||||
}
|
||||
|
||||
uint32_t kvm_get_cpuid_max_basic(void)
|
||||
{
|
||||
return kvm_get_supported_cpuid_entry(0)->eax;
|
||||
@ -1384,6 +1404,23 @@ void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid)
|
||||
}
|
||||
}
|
||||
|
||||
struct kvm_cpuid_entry2 *get_cpuid(struct kvm_cpuid2 *cpuid, uint32_t function,
|
||||
uint32_t index)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < cpuid->nent; i++) {
|
||||
struct kvm_cpuid_entry2 *cur = &cpuid->entries[i];
|
||||
|
||||
if (cur->function == function && cur->index == index)
|
||||
return cur;
|
||||
}
|
||||
|
||||
TEST_FAIL("CPUID function 0x%x index 0x%x not found ", function, index);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
bool set_cpuid(struct kvm_cpuid2 *cpuid,
|
||||
struct kvm_cpuid_entry2 *ent)
|
||||
{
|
||||
@ -1479,22 +1516,6 @@ struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpui
|
||||
return cpuid;
|
||||
}
|
||||
|
||||
#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541
|
||||
#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163
|
||||
#define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65
|
||||
|
||||
static inline unsigned x86_family(unsigned int eax)
|
||||
{
|
||||
unsigned int x86;
|
||||
|
||||
x86 = (eax >> 8) & 0xf;
|
||||
|
||||
if (x86 == 0xf)
|
||||
x86 += (eax >> 20) & 0xff;
|
||||
|
||||
return x86;
|
||||
}
|
||||
|
||||
unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
|
||||
{
|
||||
const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */
|
||||
@ -1504,11 +1525,7 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
|
||||
max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1;
|
||||
|
||||
/* Avoid reserved HyperTransport region on AMD processors. */
|
||||
eax = ecx = 0;
|
||||
cpuid(&eax, &ebx, &ecx, &edx);
|
||||
if (ebx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx ||
|
||||
ecx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx ||
|
||||
edx != X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
|
||||
if (!is_amd_cpu())
|
||||
return max_gfn;
|
||||
|
||||
/* On parts with <40 physical address bits, the area is fully hidden */
|
||||
@ -1518,6 +1535,7 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
|
||||
/* Before family 17h, the HyperTransport area is just below 1T. */
|
||||
ht_gfn = (1 << 28) - num_ht_pages;
|
||||
eax = 1;
|
||||
ecx = 0;
|
||||
cpuid(&eax, &ebx, &ecx, &edx);
|
||||
if (x86_family(eax) < 0x17)
|
||||
goto done;
|
||||
|
@ -154,6 +154,34 @@ struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct
|
||||
return guest_cpuids;
|
||||
}
|
||||
|
||||
static void set_cpuid_after_run(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid)
|
||||
{
|
||||
struct kvm_cpuid_entry2 *ent;
|
||||
int rc;
|
||||
u32 eax, ebx, x;
|
||||
|
||||
/* Setting unmodified CPUID is allowed */
|
||||
rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid);
|
||||
TEST_ASSERT(!rc, "Setting unmodified CPUID after KVM_RUN failed: %d", rc);
|
||||
|
||||
/* Changing CPU features is forbidden */
|
||||
ent = get_cpuid(cpuid, 0x7, 0);
|
||||
ebx = ent->ebx;
|
||||
ent->ebx--;
|
||||
rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid);
|
||||
TEST_ASSERT(rc, "Changing CPU features should fail");
|
||||
ent->ebx = ebx;
|
||||
|
||||
/* Changing MAXPHYADDR is forbidden */
|
||||
ent = get_cpuid(cpuid, 0x80000008, 0);
|
||||
eax = ent->eax;
|
||||
x = eax & 0xff;
|
||||
ent->eax = (eax & ~0xffu) | (x - 1);
|
||||
rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid);
|
||||
TEST_ASSERT(rc, "Changing MAXPHYADDR should fail");
|
||||
ent->eax = eax;
|
||||
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
struct kvm_cpuid2 *supp_cpuid, *cpuid2;
|
||||
@ -175,5 +203,7 @@ int main(void)
|
||||
for (stage = 0; stage < 3; stage++)
|
||||
run_vcpu(vm, VCPU_ID, stage);
|
||||
|
||||
set_cpuid_after_run(vm, cpuid2);
|
||||
|
||||
kvm_vm_free(vm);
|
||||
}
|
434
tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
Normal file
434
tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
Normal file
@ -0,0 +1,434 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Test for x86 KVM_SET_PMU_EVENT_FILTER.
|
||||
*
|
||||
* Copyright (C) 2022, Google LLC.
|
||||
*
|
||||
* This work is licensed under the terms of the GNU GPL, version 2.
|
||||
*
|
||||
* Verifies the expected behavior of allow lists and deny lists for
|
||||
* virtual PMU events.
|
||||
*/
|
||||
|
||||
#define _GNU_SOURCE /* for program_invocation_short_name */
|
||||
#include "test_util.h"
|
||||
#include "kvm_util.h"
|
||||
#include "processor.h"
|
||||
|
||||
/*
|
||||
* In lieu of copying perf_event.h into tools...
|
||||
*/
|
||||
#define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17)
|
||||
#define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22)
|
||||
|
||||
union cpuid10_eax {
|
||||
struct {
|
||||
unsigned int version_id:8;
|
||||
unsigned int num_counters:8;
|
||||
unsigned int bit_width:8;
|
||||
unsigned int mask_length:8;
|
||||
} split;
|
||||
unsigned int full;
|
||||
};
|
||||
|
||||
union cpuid10_ebx {
|
||||
struct {
|
||||
unsigned int no_unhalted_core_cycles:1;
|
||||
unsigned int no_instructions_retired:1;
|
||||
unsigned int no_unhalted_reference_cycles:1;
|
||||
unsigned int no_llc_reference:1;
|
||||
unsigned int no_llc_misses:1;
|
||||
unsigned int no_branch_instruction_retired:1;
|
||||
unsigned int no_branch_misses_retired:1;
|
||||
} split;
|
||||
unsigned int full;
|
||||
};
|
||||
|
||||
/* End of stuff taken from perf_event.h. */
|
||||
|
||||
/* Oddly, this isn't in perf_event.h. */
|
||||
#define ARCH_PERFMON_BRANCHES_RETIRED 5
|
||||
|
||||
#define VCPU_ID 0
|
||||
#define NUM_BRANCHES 42
|
||||
|
||||
/*
 * This is how the event selector and unit mask are stored in an AMD
 * core performance event-select register. Intel's format is similar,
 * but the event selector is only 8 bits.
 *
 * Layout produced by the macro:
 *   select[7:0]  -> bits  7:0
 *   umask[7:0]   -> bits 15:8
 *   select[11:8] -> bits 35:32
 *
 * Every use of an argument is parenthesized so that an expression argument
 * (e.g. "0x100 | 0xc2") is masked as a whole instead of being split by
 * operator precedence.
 */
#define EVENT(select, umask) ((((select) & 0xf00UL) << 24) | \
			      ((select) & 0xff) | \
			      (((umask) & 0xff) << 8))
|
||||
|
||||
/*
|
||||
* "Branch instructions retired", from the Intel SDM, volume 3,
|
||||
* "Pre-defined Architectural Performance Events."
|
||||
*/
|
||||
|
||||
#define INTEL_BR_RETIRED EVENT(0xc4, 0)
|
||||
|
||||
/*
|
||||
* "Retired branch instructions", from Processor Programming Reference
|
||||
* (PPR) for AMD Family 17h Model 01h, Revision B1 Processors,
|
||||
* Preliminary Processor Programming Reference (PPR) for AMD Family
|
||||
* 17h Model 31h, Revision B0 Processors, and Preliminary Processor
|
||||
* Programming Reference (PPR) for AMD Family 19h Model 01h, Revision
|
||||
* B1 Processors Volume 1 of 2.
|
||||
*/
|
||||
|
||||
#define AMD_ZEN_BR_RETIRED EVENT(0xc2, 0)
|
||||
|
||||
/*
|
||||
* This event list comprises Intel's eight architectural events plus
|
||||
* AMD's "retired branch instructions" for Zen[123] (and possibly
|
||||
* other AMD CPUs).
|
||||
*/
|
||||
static const uint64_t event_list[] = {
|
||||
EVENT(0x3c, 0),
|
||||
EVENT(0xc0, 0),
|
||||
EVENT(0x3c, 1),
|
||||
EVENT(0x2e, 0x4f),
|
||||
EVENT(0x2e, 0x41),
|
||||
EVENT(0xc4, 0),
|
||||
EVENT(0xc5, 0),
|
||||
EVENT(0xa4, 1),
|
||||
AMD_ZEN_BR_RETIRED,
|
||||
};
|
||||
|
||||
/*
|
||||
* If we encounter a #GP during the guest PMU sanity check, then the guest
|
||||
* PMU is not functional. Inform the hypervisor via GUEST_SYNC(0).
|
||||
*/
|
||||
static void guest_gp_handler(struct ex_regs *regs)
|
||||
{
|
||||
GUEST_SYNC(0);
|
||||
}
|
||||
|
||||
/*
 * Verify that @msr is writable and reads back what was written.
 *
 * The MSR is first written with its current value XOR @bits_to_flip and
 * read back, then written with its original value and read back again, so
 * the MSR holds its starting value when every step succeeds.  The caller
 * should provide a non-empty set of bits that are safe to flip.
 *
 * Returns normally on success.  Issues GUEST_SYNC(0) on any mismatch.
 */
static void check_msr(uint32_t msr, uint64_t bits_to_flip)
{
	const uint64_t original = rdmsr(msr);
	const uint64_t flipped = original ^ bits_to_flip;

	wrmsr(msr, flipped);
	if (rdmsr(msr) != flipped)
		GUEST_SYNC(0);

	wrmsr(msr, original);
	if (rdmsr(msr) != original)
		GUEST_SYNC(0);
}
|
||||
|
||||
static void intel_guest_code(void)
|
||||
{
|
||||
check_msr(MSR_CORE_PERF_GLOBAL_CTRL, 1);
|
||||
check_msr(MSR_P6_EVNTSEL0, 0xffff);
|
||||
check_msr(MSR_IA32_PMC0, 0xffff);
|
||||
GUEST_SYNC(1);
|
||||
|
||||
for (;;) {
|
||||
uint64_t br0, br1;
|
||||
|
||||
wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0);
|
||||
wrmsr(MSR_P6_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE |
|
||||
ARCH_PERFMON_EVENTSEL_OS | INTEL_BR_RETIRED);
|
||||
wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 1);
|
||||
br0 = rdmsr(MSR_IA32_PMC0);
|
||||
__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
|
||||
br1 = rdmsr(MSR_IA32_PMC0);
|
||||
GUEST_SYNC(br1 - br0);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* To avoid needing a check for CPUID.80000001:ECX.PerfCtrExtCore[bit 23],
|
||||
* this code uses the always-available, legacy K7 PMU MSRs, which alias to
|
||||
* the first four of the six extended core PMU MSRs.
|
||||
*/
|
||||
static void amd_guest_code(void)
|
||||
{
|
||||
check_msr(MSR_K7_EVNTSEL0, 0xffff);
|
||||
check_msr(MSR_K7_PERFCTR0, 0xffff);
|
||||
GUEST_SYNC(1);
|
||||
|
||||
for (;;) {
|
||||
uint64_t br0, br1;
|
||||
|
||||
wrmsr(MSR_K7_EVNTSEL0, 0);
|
||||
wrmsr(MSR_K7_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE |
|
||||
ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_BR_RETIRED);
|
||||
br0 = rdmsr(MSR_K7_PERFCTR0);
|
||||
__asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES}));
|
||||
br1 = rdmsr(MSR_K7_PERFCTR0);
|
||||
GUEST_SYNC(br1 - br0);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Run the VM to the next GUEST_SYNC(value), and return the value passed
|
||||
* to the sync. Any other exit from the guest is fatal.
|
||||
*/
|
||||
static uint64_t run_vm_to_sync(struct kvm_vm *vm)
|
||||
{
|
||||
struct kvm_run *run = vcpu_state(vm, VCPU_ID);
|
||||
struct ucall uc;
|
||||
|
||||
vcpu_run(vm, VCPU_ID);
|
||||
TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
|
||||
"Exit_reason other than KVM_EXIT_IO: %u (%s)\n",
|
||||
run->exit_reason,
|
||||
exit_reason_str(run->exit_reason));
|
||||
get_ucall(vm, VCPU_ID, &uc);
|
||||
TEST_ASSERT(uc.cmd == UCALL_SYNC,
|
||||
"Received ucall other than UCALL_SYNC: %lu", uc.cmd);
|
||||
return uc.args[1];
|
||||
}
|
||||
|
||||
/*
|
||||
* In a nested environment or if the vPMU is disabled, the guest PMU
|
||||
* might not work as architected (accessing the PMU MSRs may raise
|
||||
* #GP, or writes could simply be discarded). In those situations,
|
||||
* there is no point in running these tests. The guest code will perform
|
||||
* a sanity check and then GUEST_SYNC(success). In the case of failure,
|
||||
* the behavior of the guest on resumption is undefined.
|
||||
*/
|
||||
static bool sanity_check_pmu(struct kvm_vm *vm)
|
||||
{
|
||||
bool success;
|
||||
|
||||
vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler);
|
||||
success = run_vm_to_sync(vm);
|
||||
vm_install_exception_handler(vm, GP_VECTOR, NULL);
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
static struct kvm_pmu_event_filter *make_pmu_event_filter(uint32_t nevents)
|
||||
{
|
||||
struct kvm_pmu_event_filter *f;
|
||||
int size = sizeof(*f) + nevents * sizeof(f->events[0]);
|
||||
|
||||
f = malloc(size);
|
||||
TEST_ASSERT(f, "Out of memory");
|
||||
memset(f, 0, size);
|
||||
f->nevents = nevents;
|
||||
return f;
|
||||
}
|
||||
|
||||
static struct kvm_pmu_event_filter *event_filter(uint32_t action)
|
||||
{
|
||||
struct kvm_pmu_event_filter *f;
|
||||
int i;
|
||||
|
||||
f = make_pmu_event_filter(ARRAY_SIZE(event_list));
|
||||
f->action = action;
|
||||
for (i = 0; i < ARRAY_SIZE(event_list); i++)
|
||||
f->events[i] = event_list[i];
|
||||
|
||||
return f;
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove the first occurrence of 'event' (if any) from the filter's
|
||||
* event list.
|
||||
*/
|
||||
static struct kvm_pmu_event_filter *remove_event(struct kvm_pmu_event_filter *f,
|
||||
uint64_t event)
|
||||
{
|
||||
bool found = false;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < f->nevents; i++) {
|
||||
if (found)
|
||||
f->events[i - 1] = f->events[i];
|
||||
else
|
||||
found = f->events[i] == event;
|
||||
}
|
||||
if (found)
|
||||
f->nevents--;
|
||||
return f;
|
||||
}
|
||||
|
||||
static void test_without_filter(struct kvm_vm *vm)
|
||||
{
|
||||
uint64_t count = run_vm_to_sync(vm);
|
||||
|
||||
if (count != NUM_BRANCHES)
|
||||
pr_info("%s: Branch instructions retired = %lu (expected %u)\n",
|
||||
__func__, count, NUM_BRANCHES);
|
||||
TEST_ASSERT(count, "Allowed PMU event is not counting");
|
||||
}
|
||||
|
||||
static uint64_t test_with_filter(struct kvm_vm *vm,
|
||||
struct kvm_pmu_event_filter *f)
|
||||
{
|
||||
vm_ioctl(vm, KVM_SET_PMU_EVENT_FILTER, (void *)f);
|
||||
return run_vm_to_sync(vm);
|
||||
}
|
||||
|
||||
static void test_member_deny_list(struct kvm_vm *vm)
|
||||
{
|
||||
struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_DENY);
|
||||
uint64_t count = test_with_filter(vm, f);
|
||||
|
||||
free(f);
|
||||
if (count)
|
||||
pr_info("%s: Branch instructions retired = %lu (expected 0)\n",
|
||||
__func__, count);
|
||||
TEST_ASSERT(!count, "Disallowed PMU Event is counting");
|
||||
}
|
||||
|
||||
static void test_member_allow_list(struct kvm_vm *vm)
|
||||
{
|
||||
struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_ALLOW);
|
||||
uint64_t count = test_with_filter(vm, f);
|
||||
|
||||
free(f);
|
||||
if (count != NUM_BRANCHES)
|
||||
pr_info("%s: Branch instructions retired = %lu (expected %u)\n",
|
||||
__func__, count, NUM_BRANCHES);
|
||||
TEST_ASSERT(count, "Allowed PMU event is not counting");
|
||||
}
|
||||
|
||||
static void test_not_member_deny_list(struct kvm_vm *vm)
|
||||
{
|
||||
struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_DENY);
|
||||
uint64_t count;
|
||||
|
||||
remove_event(f, INTEL_BR_RETIRED);
|
||||
remove_event(f, AMD_ZEN_BR_RETIRED);
|
||||
count = test_with_filter(vm, f);
|
||||
free(f);
|
||||
if (count != NUM_BRANCHES)
|
||||
pr_info("%s: Branch instructions retired = %lu (expected %u)\n",
|
||||
__func__, count, NUM_BRANCHES);
|
||||
TEST_ASSERT(count, "Allowed PMU event is not counting");
|
||||
}
|
||||
|
||||
static void test_not_member_allow_list(struct kvm_vm *vm)
|
||||
{
|
||||
struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_ALLOW);
|
||||
uint64_t count;
|
||||
|
||||
remove_event(f, INTEL_BR_RETIRED);
|
||||
remove_event(f, AMD_ZEN_BR_RETIRED);
|
||||
count = test_with_filter(vm, f);
|
||||
free(f);
|
||||
if (count)
|
||||
pr_info("%s: Branch instructions retired = %lu (expected 0)\n",
|
||||
__func__, count);
|
||||
TEST_ASSERT(!count, "Disallowed PMU Event is counting");
|
||||
}
|
||||
|
||||
/*
|
||||
* Check for a non-zero PMU version, at least one general-purpose
|
||||
* counter per logical processor, an EBX bit vector of length greater
|
||||
* than 5, and EBX[5] clear.
|
||||
*/
|
||||
static bool check_intel_pmu_leaf(struct kvm_cpuid_entry2 *entry)
|
||||
{
|
||||
union cpuid10_eax eax = { .full = entry->eax };
|
||||
union cpuid10_ebx ebx = { .full = entry->ebx };
|
||||
|
||||
return eax.split.version_id && eax.split.num_counters > 0 &&
|
||||
eax.split.mask_length > ARCH_PERFMON_BRANCHES_RETIRED &&
|
||||
!ebx.split.no_branch_instruction_retired;
|
||||
}
|
||||
|
||||
/*
 * Report whether the Intel guest code should be used: the host CPU is
 * Intel and KVM's supported CPUID leaf 0xa passes check_intel_pmu_leaf().
 *
 * Note that CPUID leaf 0xa is Intel-specific.  This leaf should be clear
 * on AMD hardware.
 */
static bool use_intel_pmu(void)
{
	struct kvm_cpuid_entry2 *leaf = kvm_get_supported_cpuid_index(0xa, 0);

	if (!is_intel_cpu())
		return false;
	if (!leaf)
		return false;

	return check_intel_pmu_leaf(leaf);
}
|
||||
|
||||
/*
 * Zen1 check on a CPUID.1 EAX signature: family 0x17 with a model number
 * of at most 0x0f.
 */
static bool is_zen1(uint32_t eax)
{
	if (x86_family(eax) != 0x17)
		return false;

	return x86_model(eax) <= 0x0f;
}
|
||||
|
||||
/*
 * Zen2 check on a CPUID.1 EAX signature: family 0x17 with a model number
 * in the range 0x30..0x3f inclusive.
 */
static bool is_zen2(uint32_t eax)
{
	if (x86_family(eax) != 0x17)
		return false;
	if (x86_model(eax) < 0x30)
		return false;

	return x86_model(eax) <= 0x3f;
}
|
||||
|
||||
/*
 * Zen3 check on a CPUID.1 EAX signature: family 0x19 with a model number
 * of at most 0x0f.
 */
static bool is_zen3(uint32_t eax)
{
	if (x86_family(eax) != 0x19)
		return false;

	return x86_model(eax) <= 0x0f;
}
|
||||
|
||||
/*
|
||||
* Determining AMD support for a PMU event requires consulting the AMD
|
||||
* PPR for the CPU or reference material derived therefrom. The AMD
|
||||
* test code herein has been verified to work on Zen1, Zen2, and Zen3.
|
||||
*
|
||||
* Feel free to add more AMD CPUs that are documented to support event
|
||||
* select 0xc2 umask 0 as "retired branch instructions."
|
||||
*/
|
||||
static bool use_amd_pmu(void)
|
||||
{
|
||||
struct kvm_cpuid_entry2 *entry;
|
||||
|
||||
entry = kvm_get_supported_cpuid_index(1, 0);
|
||||
return is_amd_cpu() && entry &&
|
||||
(is_zen1(entry->eax) ||
|
||||
is_zen2(entry->eax) ||
|
||||
is_zen3(entry->eax));
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
void (*guest_code)(void) = NULL;
|
||||
struct kvm_vm *vm;
|
||||
int r;
|
||||
|
||||
/* Tell stdout not to buffer its content */
|
||||
setbuf(stdout, NULL);
|
||||
|
||||
r = kvm_check_cap(KVM_CAP_PMU_EVENT_FILTER);
|
||||
if (!r) {
|
||||
print_skip("KVM_CAP_PMU_EVENT_FILTER not supported");
|
||||
exit(KSFT_SKIP);
|
||||
}
|
||||
|
||||
if (use_intel_pmu())
|
||||
guest_code = intel_guest_code;
|
||||
else if (use_amd_pmu())
|
||||
guest_code = amd_guest_code;
|
||||
|
||||
if (!guest_code) {
|
||||
print_skip("Don't know how to test this guest PMU");
|
||||
exit(KSFT_SKIP);
|
||||
}
|
||||
|
||||
vm = vm_create_default(VCPU_ID, 0, guest_code);
|
||||
|
||||
vm_init_descriptor_tables(vm);
|
||||
vcpu_init_descriptor_tables(vm, VCPU_ID);
|
||||
|
||||
if (!sanity_check_pmu(vm)) {
|
||||
print_skip("Guest PMU is not functional");
|
||||
exit(KSFT_SKIP);
|
||||
}
|
||||
|
||||
test_without_filter(vm);
|
||||
test_member_deny_list(vm);
|
||||
test_member_allow_list(vm);
|
||||
test_not_member_deny_list(vm);
|
||||
test_not_member_allow_list(vm);
|
||||
|
||||
kvm_vm_free(vm);
|
||||
|
||||
return 0;
|
||||
}
|
@ -77,8 +77,8 @@ static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage)
|
||||
switch (get_ucall(vm, vcpuid, &uc)) {
|
||||
case UCALL_SYNC:
|
||||
TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
|
||||
uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx",
|
||||
stage + 1, (ulong)uc.args[1]);
|
||||
uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx",
|
||||
stage + 1, (ulong)uc.args[1]);
|
||||
return;
|
||||
case UCALL_DONE:
|
||||
return;
|
||||
|
@ -30,8 +30,8 @@ static struct kvm_vm *vm;
|
||||
static void l2_guest_code(void)
|
||||
{
|
||||
/* Exit to L0 */
|
||||
asm volatile("inb %%dx, %%al"
|
||||
: : [port] "d" (PORT_L0_EXIT) : "rax");
|
||||
asm volatile("inb %%dx, %%al"
|
||||
: : [port] "d" (PORT_L0_EXIT) : "rax");
|
||||
}
|
||||
|
||||
static void l1_guest_code(struct vmx_pages *vmx_pages)
|
||||
|
@ -0,0 +1,139 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
#include "test_util.h"
|
||||
#include "kvm_util.h"
|
||||
#include "processor.h"
|
||||
|
||||
#include <signal.h>
|
||||
#include <string.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#include "kselftest.h"
|
||||
|
||||
#define VCPU_ID 0
|
||||
|
||||
static struct kvm_vm *vm;
|
||||
|
||||
/*
 * #UD handler that deliberately does nothing, so the guest keeps looping
 * on its ud2 until the guest state is made invalid.
 */
static void guest_ud_handler(struct ex_regs *regs)
{
	/* Intentionally empty. */
}
|
||||
|
||||
static void guest_code(void)
|
||||
{
|
||||
asm volatile("ud2");
|
||||
}
|
||||
|
||||
static void __run_vcpu_with_invalid_state(void)
|
||||
{
|
||||
struct kvm_run *run = vcpu_state(vm, VCPU_ID);
|
||||
|
||||
vcpu_run(vm, VCPU_ID);
|
||||
|
||||
TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR,
|
||||
"Expected KVM_EXIT_INTERNAL_ERROR, got %d (%s)\n",
|
||||
run->exit_reason, exit_reason_str(run->exit_reason));
|
||||
TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION,
|
||||
"Expected emulation failure, got %d\n",
|
||||
run->emulation_failure.suberror);
|
||||
}
|
||||
|
||||
/*
 * Enter the guest with invalid state twice in a row.
 *
 * The second attempt verifies KVM handles the case where _KVM_ queues an
 * exception with invalid state and then exits to userspace, i.e. that KVM
 * doesn't explode if userspace ignores the initial error.
 */
static void run_vcpu_with_invalid_state(void)
{
	int attempt;

	for (attempt = 0; attempt < 2; attempt++)
		__run_vcpu_with_invalid_state();
}
|
||||
|
||||
/*
 * Arm ITIMER_REAL with both its initial value and its reload interval set
 * to 200 microseconds.  Failure of setitimer() fails the test.
 */
static void set_timer(void)
{
	const struct timeval period = { .tv_sec = 0, .tv_usec = 200 };
	struct itimerval timer = {
		.it_interval = period,
		.it_value = period,
	};

	ASSERT_EQ(setitimer(ITIMER_REAL, &timer, NULL), 0);
}
|
||||
|
||||
static void set_or_clear_invalid_guest_state(bool set)
|
||||
{
|
||||
static struct kvm_sregs sregs;
|
||||
|
||||
if (!sregs.cr0)
|
||||
vcpu_sregs_get(vm, VCPU_ID, &sregs);
|
||||
sregs.tr.unusable = !!set;
|
||||
vcpu_sregs_set(vm, VCPU_ID, &sregs);
|
||||
}
|
||||
|
||||
/* Convenience wrapper: mark TR unusable so the guest state is invalid. */
static void set_invalid_guest_state(void)
{
	const bool make_invalid = true;

	set_or_clear_invalid_guest_state(make_invalid);
}
|
||||
|
||||
/* Convenience wrapper: mark TR usable again so the guest state is valid. */
static void clear_invalid_guest_state(void)
{
	const bool make_invalid = false;

	set_or_clear_invalid_guest_state(make_invalid);
}
|
||||
|
||||
static void sigalrm_handler(int sig)
|
||||
{
|
||||
struct kvm_vcpu_events events;
|
||||
|
||||
TEST_ASSERT(sig == SIGALRM, "Unexpected signal = %d", sig);
|
||||
|
||||
vcpu_events_get(vm, VCPU_ID, &events);
|
||||
|
||||
/*
|
||||
* If an exception is pending, attempt KVM_RUN with invalid guest,
|
||||
* otherwise rearm the timer and keep doing so until the timer fires
|
||||
* between KVM queueing an exception and re-entering the guest.
|
||||
*/
|
||||
if (events.exception.pending) {
|
||||
set_invalid_guest_state();
|
||||
run_vcpu_with_invalid_state();
|
||||
} else {
|
||||
set_timer();
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
if (!is_intel_cpu() || vm_is_unrestricted_guest(NULL)) {
|
||||
print_skip("Must be run with kvm_intel.unrestricted_guest=0");
|
||||
exit(KSFT_SKIP);
|
||||
}
|
||||
|
||||
vm = vm_create_default(VCPU_ID, 0, (void *)guest_code);
|
||||
|
||||
vm_init_descriptor_tables(vm);
|
||||
vcpu_init_descriptor_tables(vm, VCPU_ID);
|
||||
|
||||
vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
|
||||
|
||||
/*
|
||||
* Stuff invalid guest state for L2 by making TR unusuable. The next
|
||||
* KVM_RUN should induce a TRIPLE_FAULT in L2 as KVM doesn't support
|
||||
* emulating invalid guest state for L2.
|
||||
*/
|
||||
set_invalid_guest_state();
|
||||
run_vcpu_with_invalid_state();
|
||||
|
||||
/*
|
||||
* Verify KVM also handles the case where userspace gains control while
|
||||
* an exception is pending and stuffs invalid state. Run with valid
|
||||
* guest state and a timer firing every 200us, and attempt to enter the
|
||||
* guest with invalid state when the handler interrupts KVM with an
|
||||
* exception pending.
|
||||
*/
|
||||
clear_invalid_guest_state();
|
||||
TEST_ASSERT(signal(SIGALRM, sigalrm_handler) != SIG_ERR,
|
||||
"Failed to register SIGALRM handler, errno = %d (%s)",
|
||||
errno, strerror(errno));
|
||||
|
||||
set_timer();
|
||||
run_vcpu_with_invalid_state();
|
||||
}
|
@ -46,20 +46,20 @@ static struct kvm_vm *vm;
|
||||
#define MIN_STEAL_TIME 50000
|
||||
|
||||
struct pvclock_vcpu_time_info {
|
||||
u32 version;
|
||||
u32 pad0;
|
||||
u64 tsc_timestamp;
|
||||
u64 system_time;
|
||||
u32 tsc_to_system_mul;
|
||||
s8 tsc_shift;
|
||||
u8 flags;
|
||||
u8 pad[2];
|
||||
u32 version;
|
||||
u32 pad0;
|
||||
u64 tsc_timestamp;
|
||||
u64 system_time;
|
||||
u32 tsc_to_system_mul;
|
||||
s8 tsc_shift;
|
||||
u8 flags;
|
||||
u8 pad[2];
|
||||
} __attribute__((__packed__)); /* 32 bytes */
|
||||
|
||||
struct pvclock_wall_clock {
|
||||
u32 version;
|
||||
u32 sec;
|
||||
u32 nsec;
|
||||
u32 version;
|
||||
u32 sec;
|
||||
u32 nsec;
|
||||
} __attribute__((__packed__));
|
||||
|
||||
struct vcpu_runstate_info {
|
||||
@ -74,11 +74,11 @@ struct arch_vcpu_info {
|
||||
};
|
||||
|
||||
struct vcpu_info {
|
||||
uint8_t evtchn_upcall_pending;
|
||||
uint8_t evtchn_upcall_mask;
|
||||
unsigned long evtchn_pending_sel;
|
||||
struct arch_vcpu_info arch;
|
||||
struct pvclock_vcpu_time_info time;
|
||||
uint8_t evtchn_upcall_pending;
|
||||
uint8_t evtchn_upcall_mask;
|
||||
unsigned long evtchn_pending_sel;
|
||||
struct arch_vcpu_info arch;
|
||||
struct pvclock_vcpu_time_info time;
|
||||
}; /* 64 bytes (x86) */
|
||||
|
||||
struct shared_info {
|
||||
@ -493,7 +493,7 @@ int main(int argc, char *argv[])
|
||||
|
||||
vm_ts.tv_sec = wc->sec;
|
||||
vm_ts.tv_nsec = wc->nsec;
|
||||
TEST_ASSERT(wc->version && !(wc->version & 1),
|
||||
TEST_ASSERT(wc->version && !(wc->version & 1),
|
||||
"Bad wallclock version %x", wc->version);
|
||||
TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old");
|
||||
TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new");
|
||||
|
@ -427,9 +427,6 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
|
||||
#endif
|
||||
kvm_async_pf_vcpu_init(vcpu);
|
||||
|
||||
vcpu->pre_pcpu = -1;
|
||||
INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
|
||||
|
||||
kvm_vcpu_set_in_spin_loop(vcpu, false);
|
||||
kvm_vcpu_set_dy_eligible(vcpu, false);
|
||||
vcpu->preempted = false;
|
||||
@ -3163,8 +3160,10 @@ void mark_page_dirty_in_slot(struct kvm *kvm,
|
||||
{
|
||||
struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
|
||||
|
||||
#ifdef CONFIG_HAVE_KVM_DIRTY_RING
|
||||
if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm))
|
||||
return;
|
||||
#endif
|
||||
|
||||
if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
|
||||
unsigned long rel_gfn = gfn - memslot->base_gfn;
|
||||
|
Loading…
Reference in New Issue
Block a user