commit 807b758496

Merge tag 'kvm-x86-mmu-6.4' of https://github.com/kvm-x86/linux into HEAD

KVM x86 MMU changes for 6.4:

 - Tweak FNAME(sync_spte) to avoid unnecessary writes+flushes when the guest
   is only adding new PTEs

 - Overhaul .sync_page() and .invlpg() to share the .sync_page()
   implementation, i.e. utilize .sync_page()'s optimizations when emulating
   invalidations

 - Clean up the range-based flushing APIs

 - Revamp the TDP MMU's reaping of Accessed/Dirty bits to clear a single A/D
   bit using a LOCK AND instead of XCHG, and skip all of the "handle changed
   SPTE" overhead associated with writing the entire entry

 - Track the number of "tail" entries in a pte_list_desc to avoid having to
   walk (potentially) all descriptors during insertion and deletion, which
   gets quite expensive if the guest is spamming fork()

 - Misc cleanups
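The pte_list_desc change in the second-to-last bullet is easiest to see in isolation. Below is a minimal, self-contained userspace sketch of that bookkeeping, not the kernel code: it assumes plain calloc() allocation and a bare head pointer instead of KVM's tagged rmap_head value, and the helper signatures used here (pte_list_add() taking a struct pte_list_desc **) are simplified for the demo. The point it illustrates is that caching tail_count in the newest (head) descriptor makes insertion and pte_list_count() constant time instead of walking every descriptor, with new heads allocated only when the previous head is full so tail descriptors stay full.

/*
 * Toy model of the "tail_count" optimization described in the changelog.
 * Not kernel code; names mirror the kernel's but types are simplified.
 */
#include <stdio.h>
#include <stdlib.h>

#define PTE_LIST_EXT 14

struct pte_list_desc {
	struct pte_list_desc *more;	/* next (older, always-full) descriptor */
	unsigned int spte_count;	/* entries stored in _this_ descriptor */
	unsigned int tail_count;	/* entries stored in all tail descriptors */
	unsigned long *sptes[PTE_LIST_EXT];
};

/* Add an SPTE pointer; allocate a new head only when the current head is full. */
static unsigned int pte_list_add(struct pte_list_desc **head, unsigned long *spte)
{
	struct pte_list_desc *desc = *head;
	unsigned int count;

	if (!desc || desc->spte_count == PTE_LIST_EXT) {
		struct pte_list_desc *new_head = calloc(1, sizeof(*new_head));

		new_head->more = desc;
		new_head->tail_count = desc ? desc->tail_count + desc->spte_count : 0;
		*head = new_head;
		desc = new_head;
	}
	count = desc->tail_count + desc->spte_count;
	desc->sptes[desc->spte_count++] = spte;
	return count + 1;
}

/* O(1): no need to walk the chain of descriptors. */
static unsigned int pte_list_count(const struct pte_list_desc *head)
{
	return head ? head->tail_count + head->spte_count : 0;
}

int main(void)
{
	struct pte_list_desc *head = NULL;
	unsigned long dummy[40];

	for (int i = 0; i < 40; i++)
		pte_list_add(&head, &dummy[i]);

	/* Descriptors are intentionally not freed in this short demo. */
	printf("entries tracked: %u\n", pte_list_count(head));	/* prints 40 */
	return 0;
}

The reconstructed diff of the merge follows.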
@@ -54,8 +54,8 @@ KVM_X86_OP(set_rflags)
 KVM_X86_OP(get_if_flag)
 KVM_X86_OP(flush_tlb_all)
 KVM_X86_OP(flush_tlb_current)
-KVM_X86_OP_OPTIONAL(tlb_remote_flush)
-KVM_X86_OP_OPTIONAL(tlb_remote_flush_with_range)
+KVM_X86_OP_OPTIONAL(flush_remote_tlbs)
+KVM_X86_OP_OPTIONAL(flush_remote_tlbs_range)
 KVM_X86_OP(flush_tlb_gva)
 KVM_X86_OP(flush_tlb_guest)
 KVM_X86_OP(vcpu_pre_run)
@@ -420,6 +420,10 @@ struct kvm_mmu_root_info {
 
 #define KVM_MMU_NUM_PREV_ROOTS 3
 
+#define KVM_MMU_ROOT_CURRENT BIT(0)
+#define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i)
+#define KVM_MMU_ROOTS_ALL (BIT(1 + KVM_MMU_NUM_PREV_ROOTS) - 1)
+
 #define KVM_HAVE_MMU_RWLOCK
 
 struct kvm_mmu_page;
@@ -439,9 +443,8 @@ struct kvm_mmu {
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 			    gpa_t gva_or_gpa, u64 access,
 			    struct x86_exception *exception);
-	int (*sync_page)(struct kvm_vcpu *vcpu,
-			 struct kvm_mmu_page *sp);
-	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
+	int (*sync_spte)(struct kvm_vcpu *vcpu,
+			 struct kvm_mmu_page *sp, int i);
 	struct kvm_mmu_root_info root;
 	union kvm_cpu_role cpu_role;
 	union kvm_mmu_page_role root_role;
@@ -479,11 +482,6 @@ struct kvm_mmu {
 	u64 pdptrs[4]; /* pae */
 };
 
-struct kvm_tlb_range {
-	u64 start_gfn;
-	u64 pages;
-};
-
 enum pmc_type {
 	KVM_PMC_GP = 0,
 	KVM_PMC_FIXED,
@@ -1585,9 +1583,9 @@ struct kvm_x86_ops {
 
 	void (*flush_tlb_all)(struct kvm_vcpu *vcpu);
 	void (*flush_tlb_current)(struct kvm_vcpu *vcpu);
-	int (*tlb_remote_flush)(struct kvm *kvm);
-	int (*tlb_remote_flush_with_range)(struct kvm *kvm,
-					   struct kvm_tlb_range *range);
+	int (*flush_remote_tlbs)(struct kvm *kvm);
+	int (*flush_remote_tlbs_range)(struct kvm *kvm, gfn_t gfn,
+				       gfn_t nr_pages);
 
 	/*
 	 * Flush any TLB entries associated with the given GVA.
@@ -1791,8 +1789,8 @@ void kvm_arch_free_vm(struct kvm *kvm);
 #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
 static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
 {
-	if (kvm_x86_ops.tlb_remote_flush &&
-	    !static_call(kvm_x86_tlb_remote_flush)(kvm))
+	if (kvm_x86_ops.flush_remote_tlbs &&
+	    !static_call(kvm_x86_flush_remote_tlbs)(kvm))
 		return 0;
 	else
 		return -ENOTSUPP;
@@ -1997,10 +1995,6 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state,
 	return !!(*irq_state);
 }
 
-#define KVM_MMU_ROOT_CURRENT BIT(0)
-#define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i)
-#define KVM_MMU_ROOTS_ALL (~0UL)
-
 int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
 void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
 
@@ -2044,8 +2038,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
 		       void *insn, int insn_len);
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
-void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
-			    gva_t gva, hpa_t root_hpa);
+void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+			     u64 addr, unsigned long roots);
 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
 void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
 
@@ -10,17 +10,22 @@
 #include "hyperv.h"
 #include "kvm_onhyperv.h"
 
+struct kvm_hv_tlb_range {
+	u64 start_gfn;
+	u64 pages;
+};
+
 static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
 		void *data)
 {
-	struct kvm_tlb_range *range = data;
+	struct kvm_hv_tlb_range *range = data;
 
 	return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
 			range->pages);
 }
 
 static inline int hv_remote_flush_root_tdp(hpa_t root_tdp,
-					   struct kvm_tlb_range *range)
+					   struct kvm_hv_tlb_range *range)
 {
 	if (range)
 		return hyperv_flush_guest_mapping_range(root_tdp,
@@ -29,8 +34,8 @@ static inline int hv_remote_flush_root_tdp(hpa_t root_tdp,
 	return hyperv_flush_guest_mapping(root_tdp);
 }
 
-int hv_remote_flush_tlb_with_range(struct kvm *kvm,
-		struct kvm_tlb_range *range)
+static int __hv_flush_remote_tlbs_range(struct kvm *kvm,
+		struct kvm_hv_tlb_range *range)
 {
 	struct kvm_arch *kvm_arch = &kvm->arch;
 	struct kvm_vcpu *vcpu;
@@ -86,19 +91,29 @@ int hv_remote_flush_tlb_with_range(struct kvm *kvm,
 	spin_unlock(&kvm_arch->hv_root_tdp_lock);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(hv_remote_flush_tlb_with_range);
 
-int hv_remote_flush_tlb(struct kvm *kvm)
+int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn, gfn_t nr_pages)
 {
-	return hv_remote_flush_tlb_with_range(kvm, NULL);
+	struct kvm_hv_tlb_range range = {
+		.start_gfn = start_gfn,
+		.pages = nr_pages,
+	};
+
+	return __hv_flush_remote_tlbs_range(kvm, &range);
 }
-EXPORT_SYMBOL_GPL(hv_remote_flush_tlb);
+EXPORT_SYMBOL_GPL(hv_flush_remote_tlbs_range);
+
+int hv_flush_remote_tlbs(struct kvm *kvm)
+{
+	return __hv_flush_remote_tlbs_range(kvm, NULL);
+}
+EXPORT_SYMBOL_GPL(hv_flush_remote_tlbs);
 
 void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
 {
 	struct kvm_arch *kvm_arch = &vcpu->kvm->arch;
 
-	if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) {
+	if (kvm_x86_ops.flush_remote_tlbs == hv_flush_remote_tlbs) {
 		spin_lock(&kvm_arch->hv_root_tdp_lock);
 		vcpu->arch.hv_root_tdp = root_tdp;
 		if (root_tdp != kvm_arch->hv_root_tdp)
@@ -7,9 +7,8 @@
 #define __ARCH_X86_KVM_KVM_ONHYPERV_H__
 
 #if IS_ENABLED(CONFIG_HYPERV)
-int hv_remote_flush_tlb_with_range(struct kvm *kvm,
-		struct kvm_tlb_range *range);
-int hv_remote_flush_tlb(struct kvm *kvm);
+int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, gfn_t nr_pages);
+int hv_flush_remote_tlbs(struct kvm *kvm);
 void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp);
 #else /* !CONFIG_HYPERV */
 static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
@@ -125,17 +125,31 @@ module_param(dbg, bool, 0644);
 #define PTE_LIST_EXT 14
 
 /*
- * Slight optimization of cacheline layout, by putting `more' and `spte_count'
- * at the start; then accessing it will only use one single cacheline for
- * either full (entries==PTE_LIST_EXT) case or entries<=6.
+ * struct pte_list_desc is the core data structure used to implement a custom
+ * list for tracking a set of related SPTEs, e.g. all the SPTEs that map a
+ * given GFN when used in the context of rmaps. Using a custom list allows KVM
+ * to optimize for the common case where many GFNs will have at most a handful
+ * of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small
+ * memory footprint, which in turn improves runtime performance by exploiting
+ * cache locality.
+ *
+ * A list is comprised of one or more pte_list_desc objects (descriptors).
+ * Each individual descriptor stores up to PTE_LIST_EXT SPTEs. If a descriptor
+ * is full and a new SPTEs needs to be added, a new descriptor is allocated and
+ * becomes the head of the list. This means that by definitions, all tail
+ * descriptors are full.
+ *
+ * Note, the meta data fields are deliberately placed at the start of the
+ * structure to optimize the cacheline layout; accessing the descriptor will
+ * touch only a single cacheline so long as @spte_count<=6 (or if only the
+ * descriptors metadata is accessed).
 */
 struct pte_list_desc {
 	struct pte_list_desc *more;
-	/*
-	 * Stores number of entries stored in the pte_list_desc. No need to be
-	 * u64 but just for easier alignment. When PTE_LIST_EXT, means full.
-	 */
-	u64 spte_count;
+	/* The number of PTEs stored in _this_ descriptor. */
+	u32 spte_count;
+	/* The number of PTEs stored in all tails of this descriptor. */
+	u32 tail_count;
 	u64 *sptes[PTE_LIST_EXT];
 };
 
@@ -242,34 +256,37 @@ static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
 	return regs;
 }
 
-static inline bool kvm_available_flush_tlb_with_range(void)
+static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu)
 {
-	return kvm_x86_ops.tlb_remote_flush_with_range;
+	return kvm_read_cr3(vcpu);
 }
 
-static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
-		struct kvm_tlb_range *range)
+static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
+						  struct kvm_mmu *mmu)
 {
-	int ret = -ENOTSUPP;
+	if (IS_ENABLED(CONFIG_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
+		return kvm_read_cr3(vcpu);
 
-	if (range && kvm_x86_ops.tlb_remote_flush_with_range)
-		ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range);
+	return mmu->get_guest_pgd(vcpu);
+}
 
+static inline bool kvm_available_flush_remote_tlbs_range(void)
+{
+	return kvm_x86_ops.flush_remote_tlbs_range;
+}
+
+void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn,
+				 gfn_t nr_pages)
+{
+	int ret = -EOPNOTSUPP;
+
+	if (kvm_x86_ops.flush_remote_tlbs_range)
+		ret = static_call(kvm_x86_flush_remote_tlbs_range)(kvm, start_gfn,
+								   nr_pages);
 	if (ret)
 		kvm_flush_remote_tlbs(kvm);
 }
 
-void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
-		u64 start_gfn, u64 pages)
-{
-	struct kvm_tlb_range range;
-
-	range.start_gfn = start_gfn;
-	range.pages = pages;
-
-	kvm_flush_remote_tlbs_with_range(kvm, &range);
-}
-
 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
 
 /* Flush the range of guest memory mapped by the given SPTE. */
@@ -888,9 +905,9 @@ static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	untrack_possible_nx_huge_page(kvm, sp);
 }
 
-static struct kvm_memory_slot *
-gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
+static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu,
+							    gfn_t gfn,
 							    bool no_dirty_log)
 {
 	struct kvm_memory_slot *slot;
 
@@ -929,53 +946,69 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
 		desc->sptes[0] = (u64 *)rmap_head->val;
 		desc->sptes[1] = spte;
 		desc->spte_count = 2;
+		desc->tail_count = 0;
 		rmap_head->val = (unsigned long)desc | 1;
 		++count;
 	} else {
 		rmap_printk("%p %llx many->many\n", spte, *spte);
 		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
-		while (desc->spte_count == PTE_LIST_EXT) {
-			count += PTE_LIST_EXT;
-			if (!desc->more) {
-				desc->more = kvm_mmu_memory_cache_alloc(cache);
-				desc = desc->more;
-				desc->spte_count = 0;
-				break;
-			}
-			desc = desc->more;
+		count = desc->tail_count + desc->spte_count;
+
+		/*
+		 * If the previous head is full, allocate a new head descriptor
+		 * as tail descriptors are always kept full.
+		 */
+		if (desc->spte_count == PTE_LIST_EXT) {
+			desc = kvm_mmu_memory_cache_alloc(cache);
+			desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+			desc->spte_count = 0;
+			desc->tail_count = count;
+			rmap_head->val = (unsigned long)desc | 1;
 		}
-		count += desc->spte_count;
 		desc->sptes[desc->spte_count++] = spte;
 	}
 	return count;
 }
 
-static void
-pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
-			   struct pte_list_desc *desc, int i,
-			   struct pte_list_desc *prev_desc)
+static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
+				       struct pte_list_desc *desc, int i)
 {
-	int j = desc->spte_count - 1;
+	struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+	int j = head_desc->spte_count - 1;
 
-	desc->sptes[i] = desc->sptes[j];
-	desc->sptes[j] = NULL;
-	desc->spte_count--;
-	if (desc->spte_count)
+	/*
+	 * The head descriptor should never be empty. A new head is added only
+	 * when adding an entry and the previous head is full, and heads are
+	 * removed (this flow) when they become empty.
+	 */
+	BUG_ON(j < 0);
+
+	/*
+	 * Replace the to-be-freed SPTE with the last valid entry from the head
+	 * descriptor to ensure that tail descriptors are full at all times.
+	 * Note, this also means that tail_count is stable for each descriptor.
+	 */
+	desc->sptes[i] = head_desc->sptes[j];
+	head_desc->sptes[j] = NULL;
+	head_desc->spte_count--;
+	if (head_desc->spte_count)
 		return;
-	if (!prev_desc && !desc->more)
+
+	/*
+	 * The head descriptor is empty. If there are no tail descriptors,
+	 * nullify the rmap head to mark the list as emtpy, else point the rmap
+	 * head at the next descriptor, i.e. the new head.
+	 */
+	if (!head_desc->more)
 		rmap_head->val = 0;
 	else
-		if (prev_desc)
-			prev_desc->more = desc->more;
-		else
-			rmap_head->val = (unsigned long)desc->more | 1;
-	mmu_free_pte_list_desc(desc);
+		rmap_head->val = (unsigned long)head_desc->more | 1;
+	mmu_free_pte_list_desc(head_desc);
 }
 
 static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
 {
 	struct pte_list_desc *desc;
-	struct pte_list_desc *prev_desc;
 	int i;
 
 	if (!rmap_head->val) {
@@ -991,16 +1024,13 @@ static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
 	} else {
 		rmap_printk("%p many->many\n", spte);
 		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
-		prev_desc = NULL;
 		while (desc) {
 			for (i = 0; i < desc->spte_count; ++i) {
 				if (desc->sptes[i] == spte) {
-					pte_list_desc_remove_entry(rmap_head,
-							desc, i, prev_desc);
+					pte_list_desc_remove_entry(rmap_head, desc, i);
 					return;
 				}
 			}
-			prev_desc = desc;
 			desc = desc->more;
 		}
 		pr_err("%s: %p many->many\n", __func__, spte);
@@ -1047,7 +1077,6 @@ out:
 unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
 {
 	struct pte_list_desc *desc;
-	unsigned int count = 0;
 
 	if (!rmap_head->val)
 		return 0;
@@ -1055,13 +1084,7 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
 		return 1;
 
 	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
-
-	while (desc) {
-		count += desc->spte_count;
-		desc = desc->more;
-	}
-
-	return count;
+	return desc->tail_count + desc->spte_count;
 }
 
 static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
@@ -1073,14 +1096,6 @@ static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
 	return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
 }
 
-static bool rmap_can_add(struct kvm_vcpu *vcpu)
-{
-	struct kvm_mmu_memory_cache *mc;
-
-	mc = &vcpu->arch.mmu_pte_list_desc_cache;
-	return kvm_mmu_memory_cache_nr_free_objects(mc);
-}
-
 static void rmap_remove(struct kvm *kvm, u64 *spte)
 {
 	struct kvm_memslots *slots;
@@ -1479,7 +1494,7 @@ restart:
 		}
 	}
 
-	if (need_flush && kvm_available_flush_tlb_with_range()) {
+	if (need_flush && kvm_available_flush_remote_tlbs_range()) {
 		kvm_flush_remote_tlbs_gfn(kvm, gfn, level);
 		return false;
 	}
@@ -1504,8 +1519,8 @@ struct slot_rmap_walk_iterator {
 	struct kvm_rmap_head *end_rmap;
 };
 
-static void
-rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
+static void rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator,
+				 int level)
 {
 	iterator->level = level;
 	iterator->gfn = iterator->start_gfn;
@@ -1513,10 +1528,10 @@ rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
 	iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
 }
 
-static void
-slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
-		    const struct kvm_memory_slot *slot, int start_level,
-		    int end_level, gfn_t start_gfn, gfn_t end_gfn)
+static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
+				const struct kvm_memory_slot *slot,
+				int start_level, int end_level,
+				gfn_t start_gfn, gfn_t end_gfn)
 {
 	iterator->slot = slot;
 	iterator->start_level = start_level;
@@ -1789,12 +1804,6 @@ static void mark_unsync(u64 *spte)
 	kvm_mmu_mark_parents_unsync(sp);
 }
 
-static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
-			       struct kvm_mmu_page *sp)
-{
-	return -1;
-}
-
 #define KVM_PAGE_ARRAY_NR 16
 
 struct kvm_mmu_pages {
@@ -1914,10 +1923,79 @@ static bool sp_has_gptes(struct kvm_mmu_page *sp)
 		&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])	\
 		if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
 
+static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
+
+	/*
+	 * Ignore various flags when verifying that it's safe to sync a shadow
+	 * page using the current MMU context.
+	 *
+	 * - level: not part of the overall MMU role and will never match as the MMU's
+	 *          level tracks the root level
+	 * - access: updated based on the new guest PTE
+	 * - quadrant: not part of the overall MMU role (similar to level)
+	 */
+	const union kvm_mmu_page_role sync_role_ign = {
+		.level = 0xf,
+		.access = 0x7,
+		.quadrant = 0x3,
+		.passthrough = 0x1,
+	};
+
+	/*
+	 * Direct pages can never be unsync, and KVM should never attempt to
+	 * sync a shadow page for a different MMU context, e.g. if the role
+	 * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
+	 * reserved bits checks will be wrong, etc...
+	 */
+	if (WARN_ON_ONCE(sp->role.direct || !vcpu->arch.mmu->sync_spte ||
+			 (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
+		return false;
+
+	return true;
+}
+
+static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
+{
+	if (!sp->spt[i])
+		return 0;
+
+	return vcpu->arch.mmu->sync_spte(vcpu, sp, i);
+}
+
+static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	int flush = 0;
+	int i;
+
+	if (!kvm_sync_page_check(vcpu, sp))
+		return -1;
+
+	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
+		int ret = kvm_sync_spte(vcpu, sp, i);
+
+		if (ret < -1)
+			return -1;
+		flush |= ret;
+	}
+
+	/*
+	 * Note, any flush is purely for KVM's correctness, e.g. when dropping
+	 * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
+	 * unmap or dirty logging event doesn't fail to flush. The guest is
+	 * responsible for flushing the TLB to ensure any changes in protection
+	 * bits are recognized, i.e. until the guest flushes or page faults on
+	 * a relevant address, KVM is architecturally allowed to let vCPUs use
+	 * cached translations with the old protection bits.
+	 */
+	return flush;
+}
+
 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			 struct list_head *invalid_list)
 {
-	int ret = vcpu->arch.mmu->sync_page(vcpu, sp);
+	int ret = __kvm_sync_page(vcpu, sp);
 
 	if (ret < 0)
 		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
@@ -3304,9 +3382,9 @@ static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
  * Returns true if the SPTE was fixed successfully. Otherwise,
  * someone else modified the SPTE from its original value.
  */
-static bool
-fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
+static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu,
+				    struct kvm_page_fault *fault,
 			u64 *sptep, u64 old_spte, u64 new_spte)
 {
 	/*
	 * Theoretically we could also set dirty bit (and flush TLB) here in
@@ -3513,6 +3591,8 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
 	LIST_HEAD(invalid_list);
 	bool free_active_root;
 
+	WARN_ON_ONCE(roots_to_free & ~KVM_MMU_ROOTS_ALL);
+
 	BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
 
 	/* Before acquiring the MMU lock, see if we need to do any real work. */
@@ -3731,7 +3811,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 	int quadrant, i, r;
 	hpa_t root;
 
-	root_pgd = mmu->get_guest_pgd(vcpu);
+	root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu);
 	root_gfn = root_pgd >> PAGE_SHIFT;
 
 	if (mmu_check_root(vcpu, root_gfn))
@@ -4181,7 +4261,7 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	arch.token = alloc_apf_token(vcpu);
 	arch.gfn = gfn;
 	arch.direct_map = vcpu->arch.mmu->root_role.direct;
-	arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
+	arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu);
 
 	return kvm_setup_async_pf(vcpu, cr2_or_gpa,
 				  kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
@@ -4200,7 +4280,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
 		return;
 
 	if (!vcpu->arch.mmu->root_role.direct &&
-	      work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
+	      work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
 		return;
 
 	kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
@@ -4469,8 +4549,7 @@ static void nonpaging_init_context(struct kvm_mmu *context)
 {
 	context->page_fault = nonpaging_page_fault;
 	context->gva_to_gpa = nonpaging_gva_to_gpa;
-	context->sync_page = nonpaging_sync_page;
-	context->invlpg = NULL;
+	context->sync_spte = NULL;
 }
 
 static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
@@ -4604,11 +4683,6 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
 
-static unsigned long get_cr3(struct kvm_vcpu *vcpu)
-{
-	return kvm_read_cr3(vcpu);
-}
-
 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
 			   unsigned int access)
 {
@@ -4638,10 +4712,9 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
 #include "paging_tmpl.h"
 #undef PTTYPE
 
-static void
-__reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
-			u64 pa_bits_rsvd, int level, bool nx, bool gbpages,
-			bool pse, bool amd)
+static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
+				    u64 pa_bits_rsvd, int level, bool nx,
+				    bool gbpages, bool pse, bool amd)
 {
 	u64 gbpages_bit_rsvd = 0;
 	u64 nonleaf_bit8_rsvd = 0;
@@ -4754,9 +4827,9 @@ static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
 				guest_cpuid_is_amd_or_hygon(vcpu));
 }
 
-static void
-__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
-			    u64 pa_bits_rsvd, bool execonly, int huge_page_level)
+static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
+					u64 pa_bits_rsvd, bool execonly,
+					int huge_page_level)
 {
 	u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
 	u64 large_1g_rsvd = 0, large_2m_rsvd = 0;
@@ -4856,8 +4929,7 @@ static inline bool boot_cpu_is_amd(void)
  * the direct page table on host, use as much mmu features as
  * possible, however, kvm currently does not do execution-protection.
  */
-static void
-reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
+static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
 {
 	struct rsvd_bits_validate *shadow_zero_check;
 	int i;
@@ -5060,20 +5132,18 @@ static void paging64_init_context(struct kvm_mmu *context)
 {
 	context->page_fault = paging64_page_fault;
 	context->gva_to_gpa = paging64_gva_to_gpa;
-	context->sync_page = paging64_sync_page;
-	context->invlpg = paging64_invlpg;
+	context->sync_spte = paging64_sync_spte;
 }
 
 static void paging32_init_context(struct kvm_mmu *context)
 {
 	context->page_fault = paging32_page_fault;
 	context->gva_to_gpa = paging32_gva_to_gpa;
-	context->sync_page = paging32_sync_page;
-	context->invlpg = paging32_invlpg;
+	context->sync_spte = paging32_sync_spte;
 }
 
-static union kvm_cpu_role
-kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
+static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
+					    const struct kvm_mmu_role_regs *regs)
 {
 	union kvm_cpu_role role = {0};
 
@@ -5172,9 +5242,8 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
 	context->cpu_role.as_u64 = cpu_role.as_u64;
 	context->root_role.word = root_role.word;
 	context->page_fault = kvm_tdp_page_fault;
-	context->sync_page = nonpaging_sync_page;
-	context->invlpg = NULL;
-	context->get_guest_pgd = get_cr3;
+	context->sync_spte = NULL;
+	context->get_guest_pgd = get_guest_cr3;
 	context->get_pdptr = kvm_pdptr_read;
 	context->inject_page_fault = kvm_inject_page_fault;
 
@@ -5304,8 +5373,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 
 		context->page_fault = ept_page_fault;
 		context->gva_to_gpa = ept_gva_to_gpa;
-		context->sync_page = ept_sync_page;
-		context->invlpg = ept_invlpg;
+		context->sync_spte = ept_sync_spte;
 
 		update_permission_bitmask(context, true);
 		context->pkru_mask = 0;
@@ -5324,7 +5392,7 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
 
 	kvm_init_shadow_mmu(vcpu, cpu_role);
 
-	context->get_guest_pgd = get_cr3;
+	context->get_guest_pgd = get_guest_cr3;
 	context->get_pdptr = kvm_pdptr_read;
 	context->inject_page_fault = kvm_inject_page_fault;
 }
@@ -5338,7 +5406,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
 		return;
 
 	g_context->cpu_role.as_u64 = new_mode.as_u64;
-	g_context->get_guest_pgd = get_cr3;
+	g_context->get_guest_pgd = get_guest_cr3;
 	g_context->get_pdptr = kvm_pdptr_read;
 	g_context->inject_page_fault = kvm_inject_page_fault;
 
@@ -5346,7 +5414,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
 	 * L2 page tables are never shadowed, so there is no need to sync
 	 * SPTEs.
 	 */
-	g_context->invlpg = NULL;
+	g_context->sync_spte = NULL;
 
 	/*
 	 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
@@ -5722,48 +5790,77 @@ emulate:
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
 
-void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
-			    gva_t gva, hpa_t root_hpa)
+static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+				      u64 addr, hpa_t root_hpa)
+{
+	struct kvm_shadow_walk_iterator iterator;
+
+	vcpu_clear_mmio_info(vcpu, addr);
+
+	if (!VALID_PAGE(root_hpa))
+		return;
+
+	write_lock(&vcpu->kvm->mmu_lock);
+	for_each_shadow_entry_using_root(vcpu, root_hpa, addr, iterator) {
+		struct kvm_mmu_page *sp = sptep_to_sp(iterator.sptep);
+
+		if (sp->unsync) {
+			int ret = kvm_sync_spte(vcpu, sp, iterator.index);
+
+			if (ret < 0)
+				mmu_page_zap_pte(vcpu->kvm, sp, iterator.sptep, NULL);
+			if (ret)
+				kvm_flush_remote_tlbs_sptep(vcpu->kvm, iterator.sptep);
+		}
+
+		if (!sp->unsync_children)
+			break;
+	}
+	write_unlock(&vcpu->kvm->mmu_lock);
+}
+
+void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+			     u64 addr, unsigned long roots)
 {
 	int i;
 
+	WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL);
+
 	/* It's actually a GPA for vcpu->arch.guest_mmu. */
 	if (mmu != &vcpu->arch.guest_mmu) {
 		/* INVLPG on a non-canonical address is a NOP according to the SDM. */
-		if (is_noncanonical_address(gva, vcpu))
+		if (is_noncanonical_address(addr, vcpu))
 			return;
 
-		static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
+		static_call(kvm_x86_flush_tlb_gva)(vcpu, addr);
 	}
 
-	if (!mmu->invlpg)
+	if (!mmu->sync_spte)
 		return;
 
-	if (root_hpa == INVALID_PAGE) {
-		mmu->invlpg(vcpu, gva, mmu->root.hpa);
-
-		/*
-		 * INVLPG is required to invalidate any global mappings for the VA,
-		 * irrespective of PCID. Since it would take us roughly similar amount
-		 * of work to determine whether any of the prev_root mappings of the VA
-		 * is marked global, or to just sync it blindly, so we might as well
-		 * just always sync it.
-		 *
-		 * Mappings not reachable via the current cr3 or the prev_roots will be
-		 * synced when switching to that cr3, so nothing needs to be done here
-		 * for them.
-		 */
-		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
-			if (VALID_PAGE(mmu->prev_roots[i].hpa))
-				mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
-	} else {
-		mmu->invlpg(vcpu, gva, root_hpa);
-	}
+	if (roots & KVM_MMU_ROOT_CURRENT)
+		__kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->root.hpa);
+
+	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+		if (roots & KVM_MMU_ROOT_PREVIOUS(i))
+			__kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->prev_roots[i].hpa);
+	}
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_addr);
 
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 {
-	kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE);
+	/*
+	 * INVLPG is required to invalidate any global mappings for the VA,
+	 * irrespective of PCID. Blindly sync all roots as it would take
+	 * roughly the same amount of work/time to determine whether any of the
+	 * previous roots have a global mapping.
+	 *
+	 * Mappings not reachable via the current or previous cached roots will
+	 * be synced when switching to that new cr3, so nothing needs to be
+	 * done here for them.
+	 */
+	kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL);
 	++vcpu->stat.invlpg;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
@@ -5772,27 +5869,20 @@ EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
 {
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
-	bool tlb_flush = false;
+	unsigned long roots = 0;
 	uint i;
 
-	if (pcid == kvm_get_active_pcid(vcpu)) {
-		if (mmu->invlpg)
-			mmu->invlpg(vcpu, gva, mmu->root.hpa);
-		tlb_flush = true;
-	}
+	if (pcid == kvm_get_active_pcid(vcpu))
+		roots |= KVM_MMU_ROOT_CURRENT;
 
 	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
 		if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
-		    pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) {
-			if (mmu->invlpg)
-				mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
-			tlb_flush = true;
-		}
+		    pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd))
+			roots |= KVM_MMU_ROOT_PREVIOUS(i);
 	}
 
-	if (tlb_flush)
-		static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
+	if (roots)
+		kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots);
 
 	++vcpu->stat.invlpg;
 
 	/*
@@ -5829,29 +5919,30 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
 EXPORT_SYMBOL_GPL(kvm_configure_mmu);
 
 /* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm,
+typedef bool (*slot_rmaps_handler) (struct kvm *kvm,
 				    struct kvm_rmap_head *rmap_head,
 				    const struct kvm_memory_slot *slot);
 
-/* The caller should hold mmu-lock before calling this function. */
-static __always_inline bool
-slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot,
-			slot_level_handler fn, int start_level, int end_level,
-			gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
-			bool flush)
+static __always_inline bool __walk_slot_rmaps(struct kvm *kvm,
+					      const struct kvm_memory_slot *slot,
+					      slot_rmaps_handler fn,
+					      int start_level, int end_level,
+					      gfn_t start_gfn, gfn_t end_gfn,
+					      bool flush_on_yield, bool flush)
 {
 	struct slot_rmap_walk_iterator iterator;
 
-	for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
+	lockdep_assert_held_write(&kvm->mmu_lock);
+
+	for_each_slot_rmap_range(slot, start_level, end_level, start_gfn,
 			end_gfn, &iterator) {
 		if (iterator.rmap)
-			flush |= fn(kvm, iterator.rmap, memslot);
+			flush |= fn(kvm, iterator.rmap, slot);
 
 		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
 			if (flush && flush_on_yield) {
-				kvm_flush_remote_tlbs_with_address(kvm,
-						start_gfn,
-						iterator.gfn - start_gfn + 1);
+				kvm_flush_remote_tlbs_range(kvm, start_gfn,
+							    iterator.gfn - start_gfn + 1);
 				flush = false;
 			}
 			cond_resched_rwlock_write(&kvm->mmu_lock);
@@ -5861,23 +5952,23 @@ slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot,
 	return flush;
 }
 
-static __always_inline bool
-slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot,
-		  slot_level_handler fn, int start_level, int end_level,
-		  bool flush_on_yield)
+static __always_inline bool walk_slot_rmaps(struct kvm *kvm,
+					    const struct kvm_memory_slot *slot,
+					    slot_rmaps_handler fn,
+					    int start_level, int end_level,
+					    bool flush_on_yield)
 {
-	return slot_handle_level_range(kvm, memslot, fn, start_level,
-			end_level, memslot->base_gfn,
-			memslot->base_gfn + memslot->npages - 1,
-			flush_on_yield, false);
+	return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level,
+				 slot->base_gfn, slot->base_gfn + slot->npages - 1,
+				 flush_on_yield, false);
 }
 
-static __always_inline bool
-slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot,
-		     slot_level_handler fn, bool flush_on_yield)
+static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm,
+					       const struct kvm_memory_slot *slot,
+					       slot_rmaps_handler fn,
+					       bool flush_on_yield)
 {
-	return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
-			PG_LEVEL_4K, flush_on_yield);
+	return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield);
 }
 
 static void free_mmu_pages(struct kvm_mmu *mmu)
@@ -6172,9 +6263,9 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e
 			if (WARN_ON_ONCE(start >= end))
 				continue;
 
-			flush = slot_handle_level_range(kvm, memslot, __kvm_zap_rmap,
+			flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap,
 						  PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
 						  start, end - 1, true, flush);
 		}
 	}
 
@@ -6206,8 +6297,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 	}
 
 	if (flush)
-		kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
-						   gfn_end - gfn_start);
+		kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);
 
 	kvm_mmu_invalidate_end(kvm, 0, -1ul);
 
@@ -6227,8 +6317,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 {
 	if (kvm_memslots_have_rmaps(kvm)) {
 		write_lock(&kvm->mmu_lock);
-		slot_handle_level(kvm, memslot, slot_rmap_write_protect,
+		walk_slot_rmaps(kvm, memslot, slot_rmap_write_protect,
 				  start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
 		write_unlock(&kvm->mmu_lock);
 	}
 
@@ -6463,10 +6553,9 @@ static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
 	 * all the way to the target level. There's no need to split pages
 	 * already at the target level.
 	 */
-	for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) {
-		slot_handle_level_range(kvm, slot, shadow_mmu_try_split_huge_pages,
+	for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--)
+		__walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages,
 				  level, level, start, end - 1, true, false);
-	}
 }
 
 /* Must be called with the mmu_lock held in write-mode. */
@@ -6545,7 +6634,7 @@ restart:
 							       PG_LEVEL_NUM)) {
 			kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
 
-			if (kvm_available_flush_tlb_with_range())
+			if (kvm_available_flush_remote_tlbs_range())
 				kvm_flush_remote_tlbs_sptep(kvm, sptep);
 			else
 				need_tlb_flush = 1;
@@ -6564,8 +6653,8 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
 	 * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap
 	 * pages that are already mapped at the maximum hugepage level.
 	 */
-	if (slot_handle_level(kvm, slot, kvm_mmu_zap_collapsible_spte,
+	if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte,
 			      PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
 		kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
 }
 
@@ -6596,8 +6685,7 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
 	 * is observed by any other operation on the same memslot.
 	 */
 	lockdep_assert_held(&kvm->slots_lock);
-	kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
-					   memslot->npages);
+	kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
 }
 
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
@@ -6609,7 +6697,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 		 * Clear dirty bits only on 4k SPTEs since the legacy MMU only
 		 * support dirty logging at a 4k granularity.
 		 */
-		slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);
+		walk_slot_rmaps_4k(kvm, memslot, __rmap_clear_dirty, false);
 		write_unlock(&kvm->mmu_lock);
 	}
 
@@ -6679,8 +6767,8 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
 	}
 }
 
-static unsigned long
-mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long mmu_shrink_scan(struct shrinker *shrink,
+				     struct shrink_control *sc)
 {
 	struct kvm *kvm;
 	int nr_to_scan = sc->nr_to_scan;
@@ -6738,8 +6826,8 @@ unlock:
 	return freed;
 }
 
-static unsigned long
-mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long mmu_shrink_count(struct shrinker *shrink,
+				      struct shrink_control *sc)
 {
 	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
 }
@@ -170,14 +170,14 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
 				    struct kvm_memory_slot *slot, u64 gfn,
 				    int min_level);
 
-void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
-					u64 start_gfn, u64 pages);
+void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn,
+				 gfn_t nr_pages);
 
 /* Flush the given page (huge or not) of guest memory. */
 static inline void kvm_flush_remote_tlbs_gfn(struct kvm *kvm, gfn_t gfn, int level)
 {
-	kvm_flush_remote_tlbs_with_address(kvm, gfn_round_for_level(gfn, level),
-					   KVM_PAGES_PER_HPAGE(level));
+	kvm_flush_remote_tlbs_range(kvm, gfn_round_for_level(gfn, level),
+				    KVM_PAGES_PER_HPAGE(level));
 }
 
 unsigned int pte_list_count(struct kvm_rmap_head *rmap_head);
@@ -324,7 +324,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	trace_kvm_mmu_pagetable_walk(addr, access);
 retry_walk:
 	walker->level = mmu->cpu_role.base.level;
-	pte = mmu->get_guest_pgd(vcpu);
+	pte = kvm_mmu_get_guest_pgd(vcpu, mmu);
 	have_ad = PT_HAVE_ACCESSED_DIRTY(mmu);
 
 #if PTTYPE == 64
@@ -519,7 +519,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 
 static bool
 FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-		     u64 *spte, pt_element_t gpte, bool no_dirty_log)
+		     u64 *spte, pt_element_t gpte)
 {
 	struct kvm_memory_slot *slot;
 	unsigned pte_access;
@@ -535,8 +535,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	pte_access = sp->role.access & FNAME(gpte_access)(gpte);
 	FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
 
-	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn,
-			no_dirty_log && (pte_access & ACC_WRITE_MASK));
+	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, pte_access & ACC_WRITE_MASK);
 	if (!slot)
 		return false;
 
@@ -605,7 +604,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 		if (is_shadow_present_pte(*spte))
 			continue;
 
-		if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
+		if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i]))
 			break;
 	}
 }
@ -846,64 +845,6 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
|
|||||||
return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
|
return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
|
|
||||||
{
|
|
||||||
struct kvm_shadow_walk_iterator iterator;
|
|
||||||
struct kvm_mmu_page *sp;
|
|
||||||
u64 old_spte;
|
|
||||||
int level;
|
|
||||||
u64 *sptep;
|
|
||||||
|
|
||||||
vcpu_clear_mmio_info(vcpu, gva);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* No need to check return value here, rmap_can_add() can
|
|
||||||
* help us to skip pte prefetch later.
|
|
||||||
*/
|
|
||||||
mmu_topup_memory_caches(vcpu, true);
|
|
||||||
|
|
||||||
if (!VALID_PAGE(root_hpa)) {
|
|
||||||
WARN_ON(1);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
write_lock(&vcpu->kvm->mmu_lock);
|
|
||||||
for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
|
|
||||||
level = iterator.level;
|
|
||||||
sptep = iterator.sptep;
|
|
||||||
|
|
||||||
sp = sptep_to_sp(sptep);
|
|
||||||
old_spte = *sptep;
|
|
||||||
if (is_last_spte(old_spte, level)) {
|
|
||||||
pt_element_t gpte;
|
|
||||||
gpa_t pte_gpa;
|
|
||||||
|
|
||||||
if (!sp->unsync)
|
|
||||||
break;
|
|
||||||
|
|
||||||
pte_gpa = FNAME(get_level1_sp_gpa)(sp);
|
|
||||||
pte_gpa += spte_index(sptep) * sizeof(pt_element_t);
|
|
||||||
|
|
||||||
mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL);
|
|
||||||
if (is_shadow_present_pte(old_spte))
|
|
||||||
kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
|
|
||||||
|
|
||||||
if (!rmap_can_add(vcpu))
|
|
||||||
break;
|
|
||||||
|
|
||||||
if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
|
|
||||||
sizeof(pt_element_t)))
|
|
||||||
break;
|
|
||||||
|
|
||||||
FNAME(prefetch_gpte)(vcpu, sp, sptep, gpte, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!sp->unsync_children)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
write_unlock(&vcpu->kvm->mmu_lock);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
|
/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
|
||||||
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
|
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
|
||||||
gpa_t addr, u64 access,
|
gpa_t addr, u64 access,
|
||||||
@ -936,114 +877,75 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
|
|||||||
* can't change unless all sptes pointing to it are nuked first.
|
* can't change unless all sptes pointing to it are nuked first.
|
||||||
*
|
*
|
||||||
* Returns
|
* Returns
|
||||||
* < 0: the sp should be zapped
|
* < 0: failed to sync spte
|
||||||
* 0: the sp is synced and no tlb flushing is required
|
* 0: the spte is synced and no tlb flushing is required
|
||||||
* > 0: the sp is synced and tlb flushing is required
|
* > 0: the spte is synced and tlb flushing is required
|
||||||
*/
|
*/
|
||||||
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
|
static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
|
||||||
{
|
{
|
||||||
union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
|
|
||||||
int i;
|
|
||||||
bool host_writable;
|
bool host_writable;
|
||||||
gpa_t first_pte_gpa;
|
gpa_t first_pte_gpa;
|
||||||
bool flush = false;
|
u64 *sptep, spte;
|
||||||
|
struct kvm_memory_slot *slot;
|
||||||
|
unsigned pte_access;
|
||||||
|
pt_element_t gpte;
|
||||||
|
gpa_t pte_gpa;
|
||||||
|
gfn_t gfn;
|
||||||
|
|
||||||
/*
|
if (WARN_ON_ONCE(!sp->spt[i]))
|
||||||
* Ignore various flags when verifying that it's safe to sync a shadow
|
return 0;
|
||||||
* page using the current MMU context.
|
|
||||||
*
|
|
||||||
* - level: not part of the overall MMU role and will never match as the MMU's
|
|
||||||
* level tracks the root level
|
|
||||||
* - access: updated based on the new guest PTE
|
|
||||||
* - quadrant: not part of the overall MMU role (similar to level)
|
|
||||||
*/
|
|
||||||
const union kvm_mmu_page_role sync_role_ign = {
|
|
||||||
.level = 0xf,
|
|
||||||
.access = 0x7,
|
|
||||||
.quadrant = 0x3,
|
|
||||||
.passthrough = 0x1,
|
|
||||||
};
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Direct pages can never be unsync, and KVM should never attempt to
|
|
||||||
* sync a shadow page for a different MMU context, e.g. if the role
|
|
||||||
* differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
|
|
||||||
* reserved bits checks will be wrong, etc...
|
|
||||||
*/
|
|
||||||
if (WARN_ON_ONCE(sp->role.direct ||
|
|
||||||
(sp->role.word ^ root_role.word) & ~sync_role_ign.word))
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
|
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
|
||||||
|
pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
|
||||||
|
|
||||||
for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
|
if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
|
||||||
u64 *sptep, spte;
|
sizeof(pt_element_t)))
|
||||||
struct kvm_memory_slot *slot;
|
return -1;
|
||||||
unsigned pte_access;
|
|
||||||
pt_element_t gpte;
|
|
||||||
gpa_t pte_gpa;
|
|
||||||
gfn_t gfn;
|
|
||||||
|
|
||||||
if (!sp->spt[i])
|
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte))
|
||||||
continue;
|
return 1;
|
||||||
|
|
||||||
pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
|
gfn = gpte_to_gfn(gpte);
|
||||||
|
pte_access = sp->role.access;
|
||||||
|
pte_access &= FNAME(gpte_access)(gpte);
|
||||||
|
FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
|
||||||
|
|
||||||
if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
|
if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
|
||||||
sizeof(pt_element_t)))
|
return 0;
|
||||||
return -1;
|
|
||||||
|
|
||||||
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
|
|
||||||
flush = true;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
gfn = gpte_to_gfn(gpte);
|
|
||||||
pte_access = sp->role.access;
|
|
||||||
pte_access &= FNAME(gpte_access)(gpte);
|
|
||||||
FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
|
|
||||||
|
|
||||||
if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Drop the SPTE if the new protections would result in a RWX=0
|
|
||||||
* SPTE or if the gfn is changing. The RWX=0 case only affects
|
|
||||||
* EPT with execute-only support, i.e. EPT without an effective
|
|
||||||
* "present" bit, as all other paging modes will create a
|
|
||||||
* read-only SPTE if pte_access is zero.
|
|
||||||
*/
|
|
||||||
if ((!pte_access && !shadow_present_mask) ||
|
|
||||||
gfn != kvm_mmu_page_get_gfn(sp, i)) {
|
|
||||||
drop_spte(vcpu->kvm, &sp->spt[i]);
|
|
||||||
flush = true;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Update the shadowed access bits in case they changed. */
|
|
||||||
kvm_mmu_page_set_access(sp, i, pte_access);
|
|
||||||
|
|
||||||
sptep = &sp->spt[i];
|
|
||||||
spte = *sptep;
|
|
||||||
host_writable = spte & shadow_host_writable_mask;
|
|
||||||
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
|
|
||||||
make_spte(vcpu, sp, slot, pte_access, gfn,
|
|
||||||
spte_to_pfn(spte), spte, true, false,
|
|
||||||
host_writable, &spte);
|
|
||||||
|
|
||||||
flush |= mmu_spte_update(sptep, spte);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Note, any flush is purely for KVM's correctness, e.g. when dropping
|
* Drop the SPTE if the new protections would result in a RWX=0
|
||||||
* an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
|
* SPTE or if the gfn is changing. The RWX=0 case only affects
|
||||||
* unmap or dirty logging event doesn't fail to flush. The guest is
|
* EPT with execute-only support, i.e. EPT without an effective
|
||||||
* responsible for flushing the TLB to ensure any changes in protection
|
* "present" bit, as all other paging modes will create a
|
||||||
* bits are recognized, i.e. until the guest flushes or page faults on
|
* read-only SPTE if pte_access is zero.
|
||||||
* a relevant address, KVM is architecturally allowed to let vCPUs use
|
|
||||||
* cached translations with the old protection bits.
|
|
||||||
*/
|
*/
|
||||||
return flush;
|
if ((!pte_access && !shadow_present_mask) ||
|
||||||
|
gfn != kvm_mmu_page_get_gfn(sp, i)) {
|
||||||
|
drop_spte(vcpu->kvm, &sp->spt[i]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
* Do nothing if the permissions are unchanged. The existing SPTE is
|
||||||
|
* still, and prefetch_invalid_gpte() has verified that the A/D bits
|
||||||
|
* are set in the "new" gPTE, i.e. there is no danger of missing an A/D
|
||||||
|
* update due to A/D bits being set in the SPTE but not the gPTE.
|
||||||
|
*/
|
||||||
|
if (kvm_mmu_page_get_access(sp, i) == pte_access)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
/* Update the shadowed access bits in case they changed. */
|
||||||
|
kvm_mmu_page_set_access(sp, i, pte_access);
|
||||||
|
|
||||||
|
sptep = &sp->spt[i];
|
||||||
|
spte = *sptep;
|
||||||
|
host_writable = spte & shadow_host_writable_mask;
|
||||||
|
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
|
||||||
|
make_spte(vcpu, sp, slot, pte_access, gfn,
|
||||||
|
spte_to_pfn(spte), spte, true, false,
|
||||||
|
host_writable, &spte);
|
||||||
|
|
||||||
|
return mmu_spte_update(sptep, spte);
|
||||||
}
|
}
|
||||||
|
|
||||||
#undef pt_element_t
|
#undef pt_element_t
|
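For context on the new per-SPTE return contract documented in the FNAME(sync_spte) hunk above (< 0: failed to sync, 0: synced with no flush needed, > 0: synced and a flush is needed), here is a hedged, standalone sketch of how a hypothetical caller could drive zap/flush decisions from those values. sync_one() and everything else below are stand-ins invented for illustration, not KVM code:

#include <stdbool.h>
#include <stdio.h>

#define ENTRIES 8

/* Pretend per-entry sync; returns -1, 0 or 1 like FNAME(sync_spte). */
static int sync_one(int i)
{
	if (i == 5)
		return 1;	/* entry resynced, TLB flush now required */
	return 0;		/* entry already in sync */
}

/* Sync every entry of one "page"; false means the caller should zap it. */
static bool sync_page(bool *need_flush)
{
	int i, ret;

	for (i = 0; i < ENTRIES; i++) {
		ret = sync_one(i);
		if (ret < 0)
			return false;
		if (ret > 0)
			*need_flush = true;
	}
	return true;
}

int main(void)
{
	bool flush = false;

	if (!sync_page(&flush))
		printf("zap the shadow page\n");
	else
		printf("synced, flush needed: %s\n", flush ? "yes" : "no");
	return 0;
}
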
@@ -164,7 +164,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	/*
 	 * For simplicity, enforce the NX huge page mitigation even if not
 	 * strictly necessary.  KVM could ignore the mitigation if paging is
-	 * disabled in the guest, as the guest doesn't have an page tables to
+	 * disabled in the guest, as the guest doesn't have any page tables to
 	 * abuse.  But to safely ignore the mitigation, KVM would have to
 	 * ensure a new MMU is loaded (or all shadow pages zapped) when CR0.PG
 	 * is toggled on, and that's a net negative for performance when TDP is
@@ -29,29 +29,49 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
 	WRITE_ONCE(*rcu_dereference(sptep), new_spte);
 }
 
+/*
+ * SPTEs must be modified atomically if they are shadow-present, leaf
+ * SPTEs, and have volatile bits, i.e. has bits that can be set outside
+ * of mmu_lock.  The Writable bit can be set by KVM's fast page fault
+ * handler, and Accessed and Dirty bits can be set by the CPU.
+ *
+ * Note, non-leaf SPTEs do have Accessed bits and those bits are
+ * technically volatile, but KVM doesn't consume the Accessed bit of
+ * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit.  This
+ * logic needs to be reassessed if KVM were to use non-leaf Accessed
+ * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
+ */
+static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level)
+{
+	return is_shadow_present_pte(old_spte) &&
+	       is_last_spte(old_spte, level) &&
+	       spte_has_volatile_bits(old_spte);
+}
+
 static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
 					 u64 new_spte, int level)
 {
-	/*
-	 * Atomically write the SPTE if it is a shadow-present, leaf SPTE with
-	 * volatile bits, i.e. has bits that can be set outside of mmu_lock.
-	 * The Writable bit can be set by KVM's fast page fault handler, and
-	 * Accessed and Dirty bits can be set by the CPU.
-	 *
-	 * Note, non-leaf SPTEs do have Accessed bits and those bits are
-	 * technically volatile, but KVM doesn't consume the Accessed bit of
-	 * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit.  This
-	 * logic needs to be reassessed if KVM were to use non-leaf Accessed
-	 * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
-	 */
-	if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) &&
-	    spte_has_volatile_bits(old_spte))
+	if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level))
 		return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);
 
 	__kvm_tdp_mmu_write_spte(sptep, new_spte);
 	return old_spte;
 }
 
+static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte,
+					  u64 mask, int level)
+{
+	atomic64_t *sptep_atomic;
+
+	if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) {
+		sptep_atomic = (atomic64_t *)rcu_dereference(sptep);
+		return (u64)atomic64_fetch_and(~mask, sptep_atomic);
+	}
+
+	__kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask);
+	return old_spte;
+}
+
 /*
  * A TDP iterator performs a pre-order walk over a TDP paging structure.
  */
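The new kvm_tdp_mmu_spte_need_atomic_write() / tdp_mmu_clear_spte_bits() helpers above rely on atomic fetch-and semantics: only the requested bits are cleared, bits that hardware may set concurrently (Accessed/Dirty) survive, and the caller still gets the prior value back. A standalone C11 illustration of just that property (not kernel code; the bit positions are arbitrary):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define ACCESSED_BIT	(1ull << 5)	/* arbitrary positions for the demo */
#define DIRTY_BIT	(1ull << 6)

static _Atomic uint64_t spte = (1ull << 63) | ACCESSED_BIT | DIRTY_BIT | 0x1ull;

/* Clear @mask in *p and return the value that was there just before. */
static uint64_t clear_bits(_Atomic uint64_t *p, uint64_t mask)
{
	return atomic_fetch_and(p, ~mask);
}

int main(void)
{
	uint64_t old = clear_bits(&spte, ACCESSED_BIT);

	printf("old value:     %#llx (accessed=%d dirty=%d)\n",
	       (unsigned long long)old,
	       (int)!!(old & ACCESSED_BIT), (int)!!(old & DIRTY_BIT));
	printf("current value: %#llx\n",
	       (unsigned long long)atomic_load(&spte));
	return 0;
}
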
@@ -334,35 +334,6 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 				u64 old_spte, u64 new_spte, int level,
 				bool shared);
 
-static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
-{
-	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
-		return;
-
-	if (is_accessed_spte(old_spte) &&
-	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
-	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
-		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
-}
-
-static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
-					  u64 old_spte, u64 new_spte, int level)
-{
-	bool pfn_changed;
-	struct kvm_memory_slot *slot;
-
-	if (level > PG_LEVEL_4K)
-		return;
-
-	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
-
-	if ((!is_writable_pte(old_spte) || pfn_changed) &&
-	    is_writable_pte(new_spte)) {
-		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
-		mark_page_dirty_in_slot(kvm, slot, gfn);
-	}
-}
-
 static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	kvm_account_pgtable_pages((void *)sp->spt, +1);
@@ -505,7 +476,7 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
 }
 
 /**
- * __handle_changed_spte - handle bookkeeping associated with an SPTE change
+ * handle_changed_spte - handle bookkeeping associated with an SPTE change
  * @kvm: kvm instance
  * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
@@ -516,12 +487,13 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
- * Handle bookkeeping that might result from the modification of a SPTE.
- * This function must be called for all TDP SPTE modifications.
+ * Handle bookkeeping that might result from the modification of a SPTE.  Note,
+ * dirty logging updates are handled in common code, not here (see make_spte()
+ * and fast_pf_fix_direct_spte()).
 */
-static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
+static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 				u64 old_spte, u64 new_spte, int level,
 				bool shared)
 {
 	bool was_present = is_shadow_present_pte(old_spte);
 	bool is_present = is_shadow_present_pte(new_spte);
@@ -605,17 +577,10 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 	if (was_present && !was_leaf &&
 	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
 		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
-}
 
-static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
-				u64 old_spte, u64 new_spte, int level,
-				bool shared)
-{
-	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
-			      shared);
-	handle_changed_spte_acc_track(old_spte, new_spte, level);
-	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
-				      new_spte, level);
+	if (was_leaf && is_accessed_spte(old_spte) &&
+	    (!is_present || !is_accessed_spte(new_spte) || pfn_changed))
+		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
 }
 
 /*
@@ -658,9 +623,8 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
 	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
 		return -EBUSY;
 
-	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
-			      new_spte, iter->level, true);
-	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
+	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+			    new_spte, iter->level, true);
 
 	return 0;
 }
@@ -696,7 +660,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
 
 
 /*
- * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
+ * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm:	      KVM instance
 * @as_id:	      Address space ID, i.e. regular vs. SMM
 * @sptep:	      Pointer to the SPTE
@@ -704,23 +668,12 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
 * @new_spte:	      The new value that will be set for the SPTE
 * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
 * @level:	      The level _containing_ the SPTE (its parent PT's level)
- * @record_acc_track: Notify the MM subsystem of changes to the accessed state
- *		      of the page.  Should be set unless handling an MMU
- *		      notifier for access tracking.  Leaving record_acc_track
- *		      unset in that case prevents page accesses from being
- *		      double counted.
- * @record_dirty_log: Record the page as dirty in the dirty bitmap if
- *		      appropriate for the change being made.  Should be set
- *		      unless performing certain dirty logging operations.
- *		      Leaving record_dirty_log unset in that case prevents page
- *		      writes from being double counted.
 *
 * Returns the old SPTE value, which _may_ be different than @old_spte if the
 * SPTE had voldatile bits.
 */
-static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
-			      u64 old_spte, u64 new_spte, gfn_t gfn, int level,
-			      bool record_acc_track, bool record_dirty_log)
+static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
+			    u64 old_spte, u64 new_spte, gfn_t gfn, int level)
 {
 	lockdep_assert_held_write(&kvm->mmu_lock);
 
@@ -735,46 +688,17 @@ static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
 
 	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
 
-	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
-
-	if (record_acc_track)
-		handle_changed_spte_acc_track(old_spte, new_spte, level);
-	if (record_dirty_log)
-		handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
-					      new_spte, level);
+	handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
 	return old_spte;
 }
 
-static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
-				     u64 new_spte, bool record_acc_track,
-				     bool record_dirty_log)
+static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
+					 u64 new_spte)
 {
 	WARN_ON_ONCE(iter->yielded);
 
-	iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
-					    iter->old_spte, new_spte,
-					    iter->gfn, iter->level,
-					    record_acc_track, record_dirty_log);
-}
-
-static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
-				    u64 new_spte)
-{
-	_tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
-}
-
-static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
-						 struct tdp_iter *iter,
-						 u64 new_spte)
-{
-	_tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
-}
-
-static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
-						 struct tdp_iter *iter,
-						 u64 new_spte)
-{
-	_tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
+	iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
+					  iter->old_spte, new_spte,
+					  iter->gfn, iter->level);
 }
 
 #define tdp_root_for_each_pte(_iter, _root, _start, _end)	\
@@ -866,7 +790,7 @@ retry:
 			continue;
 
 		if (!shared)
-			tdp_mmu_set_spte(kvm, &iter, 0);
+			tdp_mmu_iter_set_spte(kvm, &iter, 0);
 		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
 			goto retry;
 	}
@@ -923,8 +847,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
 	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
 		return false;
 
-	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
-			   sp->gfn, sp->role.level + 1, true, true);
+	tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
+			 sp->gfn, sp->role.level + 1);
 
 	return true;
 }
@@ -958,7 +882,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
 		    !is_last_spte(iter.old_spte, iter.level))
 			continue;
 
-		tdp_mmu_set_spte(kvm, &iter, 0);
+		tdp_mmu_iter_set_spte(kvm, &iter, 0);
 		flush = true;
 	}
 
@@ -1128,7 +1052,7 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
 		if (ret)
 			return ret;
 	} else {
-		tdp_mmu_set_spte(kvm, iter, spte);
+		tdp_mmu_iter_set_spte(kvm, iter, spte);
 	}
 
 	tdp_account_mmu_page(kvm, sp);
@@ -1262,33 +1186,42 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
 /*
 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
 * if any of the GFNs in the range have been accessed.
+ *
+ * No need to mark the corresponding PFN as accessed as this call is coming
+ * from the clear_young() or clear_flush_young() notifier, which uses the
+ * return value to determine if the page has been accessed.
 */
 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
 			  struct kvm_gfn_range *range)
 {
-	u64 new_spte = 0;
+	u64 new_spte;
 
 	/* If we have a non-accessed entry we don't need to change the pte. */
 	if (!is_accessed_spte(iter->old_spte))
 		return false;
 
-	new_spte = iter->old_spte;
-
-	if (spte_ad_enabled(new_spte)) {
-		new_spte &= ~shadow_accessed_mask;
+	if (spte_ad_enabled(iter->old_spte)) {
+		iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
+							 iter->old_spte,
+							 shadow_accessed_mask,
+							 iter->level);
+		new_spte = iter->old_spte & ~shadow_accessed_mask;
 	} else {
 		/*
 		 * Capture the dirty status of the page, so that it doesn't get
 		 * lost when the SPTE is marked for access tracking.
 		 */
-		if (is_writable_pte(new_spte))
-			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
+		if (is_writable_pte(iter->old_spte))
+			kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));
 
-		new_spte = mark_spte_for_access_track(new_spte);
+		new_spte = mark_spte_for_access_track(iter->old_spte);
+		iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
+							iter->old_spte, new_spte,
+							iter->level);
 	}
 
-	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
-
+	trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
+				       iter->old_spte, new_spte);
 	return true;
 }
 
@@ -1324,15 +1257,15 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
 	 * Note, when changing a read-only SPTE, it's not strictly necessary to
 	 * zero the SPTE before setting the new PFN, but doing so preserves the
 	 * invariant that the PFN of a present * leaf SPTE can never change.
-	 * See __handle_changed_spte().
+	 * See handle_changed_spte().
 	 */
-	tdp_mmu_set_spte(kvm, iter, 0);
+	tdp_mmu_iter_set_spte(kvm, iter, 0);
 
 	if (!pte_write(range->pte)) {
 		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
 								  pte_pfn(range->pte));
 
-		tdp_mmu_set_spte(kvm, iter, new_spte);
+		tdp_mmu_iter_set_spte(kvm, iter, new_spte);
 	}
 
 	return true;
@@ -1349,7 +1282,7 @@ bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 	/*
 	 * No need to handle the remote TLB flush under RCU protection, the
 	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
-	 * shadow page. See the WARN on pfn_changed in __handle_changed_spte().
+	 * shadow page. See the WARN on pfn_changed in handle_changed_spte().
 	 */
 	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
 }
@@ -1607,8 +1540,8 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
 				  gfn_t start, gfn_t end)
 {
+	u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
 	struct tdp_iter iter;
-	u64 new_spte;
 	bool spte_set = false;
 
 	rcu_read_lock();
@@ -1621,19 +1554,13 @@ retry:
 		if (!is_shadow_present_pte(iter.old_spte))
 			continue;
 
-		if (spte_ad_need_write_protect(iter.old_spte)) {
-			if (is_writable_pte(iter.old_spte))
-				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
-			else
-				continue;
-		} else {
-			if (iter.old_spte & shadow_dirty_mask)
-				new_spte = iter.old_spte & ~shadow_dirty_mask;
-			else
-				continue;
-		}
+		MMU_WARN_ON(kvm_ad_enabled() &&
+			    spte_ad_need_write_protect(iter.old_spte));
+
+		if (!(iter.old_spte & dbit))
+			continue;
 
-		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
+		if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
 			goto retry;
 
 		spte_set = true;
@@ -1675,8 +1602,9 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
 				  gfn_t gfn, unsigned long mask, bool wrprot)
 {
+	u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
+						   shadow_dirty_mask;
 	struct tdp_iter iter;
-	u64 new_spte;
 
 	rcu_read_lock();
 
@@ -1685,25 +1613,26 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
 		if (!mask)
 			break;
 
+		MMU_WARN_ON(kvm_ad_enabled() &&
+			    spte_ad_need_write_protect(iter.old_spte));
+
 		if (iter.level > PG_LEVEL_4K ||
 		    !(mask & (1UL << (iter.gfn - gfn))))
 			continue;
 
 		mask &= ~(1UL << (iter.gfn - gfn));
 
-		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
-			if (is_writable_pte(iter.old_spte))
-				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
-			else
-				continue;
-		} else {
-			if (iter.old_spte & shadow_dirty_mask)
-				new_spte = iter.old_spte & ~shadow_dirty_mask;
-			else
-				continue;
-		}
+		if (!(iter.old_spte & dbit))
+			continue;
 
-		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
+		iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
+							iter.old_spte, dbit,
+							iter.level);
+
+		trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
+					       iter.old_spte,
+					       iter.old_spte & ~dbit);
+		kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
 	}
 
 	rcu_read_unlock();
@@ -1821,7 +1750,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
 		if (new_spte == iter.old_spte)
 			break;
 
-		tdp_mmu_set_spte(kvm, &iter, new_spte);
+		tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
 		spte_set = true;
 	}
 
@@ -35,9 +35,8 @@ static inline __init void svm_hv_hardware_setup(void)
 	if (npt_enabled &&
 	    ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) {
 		pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n");
-		svm_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
-		svm_x86_ops.tlb_remote_flush_with_range =
-				hv_remote_flush_tlb_with_range;
+		svm_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
+		svm_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
 	}
 
 	if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) {
@@ -358,6 +358,7 @@ static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
 static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
 				       gpa_t addr)
 {
+	unsigned long roots = 0;
 	uint i;
 	struct kvm_mmu_root_info *cached_root;
 
@@ -368,8 +369,10 @@ static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
 
 		if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
 					    eptp))
-			vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa);
+			roots |= KVM_MMU_ROOT_PREVIOUS(i);
 	}
+	if (roots)
+		kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots);
 }
 
 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
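The nested EPT hunk above switches from invoking the invalidation hook once per matching cached root to collecting the matches in a bitmask and issuing a single kvm_mmu_invalidate_addr() call. A minimal standalone sketch of that collect-then-invalidate pattern; the per-root bit layout and helpers below are invented for illustration (the real mask bits come from KVM's own KVM_MMU_ROOT_* definitions):

#include <stdio.h>

#define NUM_PREV_ROOTS	3
#define ROOT_BIT(i)	(1ul << (i))	/* illustrative layout only */

/* Pretend root-vs-EPTP match; in this demo only cached root 1 matches. */
static int root_matches(int i)
{
	return i == 1;
}

int main(void)
{
	unsigned long roots = 0;
	int i;

	for (i = 0; i < NUM_PREV_ROOTS; i++) {
		if (root_matches(i))
			roots |= ROOT_BIT(i);
	}

	/* One invalidation for all matching roots instead of one per root. */
	if (roots)
		printf("invalidate address in roots mask %#lx\n", roots);
	return 0;
}
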
@@ -8395,9 +8395,8 @@ static __init int hardware_setup(void)
 #if IS_ENABLED(CONFIG_HYPERV)
 	if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
 	    && enable_ept) {
-		vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
-		vmx_x86_ops.tlb_remote_flush_with_range =
-				hv_remote_flush_tlb_with_range;
+		vmx_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
+		vmx_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
 	}
 #endif
 
@@ -802,8 +802,8 @@ void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
 	 */
 	if ((fault->error_code & PFERR_PRESENT_MASK) &&
 	    !(fault->error_code & PFERR_RSVD_MASK))
-		kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
-				       fault_mmu->root.hpa);
+		kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address,
+					KVM_MMU_ROOT_CURRENT);
 
 	fault_mmu->inject_page_fault(vcpu, fault);
 }