// SPDX-License-Identifier: GPL-2.0
#ifndef __KVM_X86_MMU_TDP_MMU_H
#define __KVM_X86_MMU_TDP_MMU_H

#include <linux/kvm_host.h>

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);

__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
{
	return refcount_inc_not_zero(&root->tdp_mmu_root_count);
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared);
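
/*
 * Root reference counting: kvm_tdp_mmu_get_root() succeeds only if the
 * root's reference count is not already zero, i.e. only if the root is not
 * already being torn down, and every successful get must be balanced by a
 * kvm_tdp_mmu_put_root().  Minimal usage sketch; the function below is
 * hypothetical and assumes mmu_lock is held for write (shared == false):
 *
 *	static void example_visit_root(struct kvm *kvm,
 *				       struct kvm_mmu_page *root)
 *	{
 *		if (!kvm_tdp_mmu_get_root(root))
 *			return;
 *
 *		// ... operate on the root's paging structures ...
 *
 *		kvm_tdp_mmu_put_root(kvm, root, false);
 *	}
 */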

/*
 * Zap only leaf SPTEs in the gfn-range and mmu_notifier unmap paths.  The
 * "flush" result must be accumulated across all roots, including nested
 * TDP shadow roots, so that a pending TLB flush is never lost when
 * multiple roots are processed.
 */
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start,
			   gfn_t end, bool can_yield, bool flush);
bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp);
void kvm_tdp_mmu_zap_all(struct kvm *kvm);
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
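
/*
 * Example of accumulating the "flush" result across address spaces when
 * zapping a gfn range, per the rule above that a pending TLB flush must
 * not be dropped.  Illustrative sketch only: the helper is hypothetical
 * and assumes mmu_lock is already held for write.
 *
 *	static void example_zap_range(struct kvm *kvm, gfn_t start, gfn_t end)
 *	{
 *		bool flush = false;
 *		int i;
 *
 *		for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
 *			flush = kvm_tdp_mmu_zap_leafs(kvm, i, start, end,
 *						      true, flush);
 *
 *		if (flush)
 *			kvm_flush_remote_tlbs(kvm);
 *	}
 */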
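
/*
 * Fault-in path: installs the SPTEs needed to map the GPA described by
 * @fault under the vCPU's TDP MMU root (brief description inferred from
 * the callers, not spelled out in this header).
 */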
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);

/*
 * mmu_notifier callbacks.  The hva->gfn lookups are done by common KVM
 * code, which walks the memslots once and invokes these handlers with the
 * resulting gfn ranges (this consolidation also removed the duplicate
 * memslot walks the legacy MMU and the TDP MMU used to do on x86).
 */
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush);
bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
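
/*
 * kvm_tdp_mmu_unmap_gfn_range() takes and returns the accumulated "flush"
 * state so a pending TLB flush is never lost.  Hypothetical caller sketch
 * (real callers also consult the legacy MMU and check that the TDP MMU is
 * in use):
 *
 *	static bool example_unmap_range(struct kvm *kvm,
 *					struct kvm_gfn_range *range)
 *	{
 *		bool flush = false;
 *
 *		flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
 *		return flush;
 *	}
 */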
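
/*
 * Dirty-logging helpers (summary inferred from the function names and
 * their use in dirty logging, not spelled out in this header):
 * write-protect a whole memslot down to @min_level, clear dirty state for
 * a memslot or for a gfn mask, and zap SPTEs that could be mapped by huge
 * pages again once dirty logging is disabled.
 */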
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *slot, int min_level);
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
				  const struct kvm_memory_slot *slot);
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot);
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       const struct kvm_memory_slot *slot);

bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn,
				   int min_level);

/*
 * Eager page splitting: when dirty logging is enabled, try to split all
 * huge pages in the memslot down to 4KiB pages up front so that vCPUs do
 * not have to take write-protection faults to split them later.  Splitting
 * is best-effort; failures (e.g. out of memory) are fine because KVM will
 * follow up by write-protecting any remaining huge pages.  This moves the
 * cost of splitting off the vCPU threads and onto the thread enabling
 * dirty logging for the memslot.
 */
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
				      const struct kvm_memory_slot *slot,
				      gfn_t start, gfn_t end,
				      int target_level, bool shared);
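
/*
 * Illustrative sketch of eagerly splitting an entire memslot down to 4KiB
 * mappings when dirty logging is enabled.  The helper is hypothetical, and
 * it assumes mmu_lock is held for write (shared == false) and that
 * PG_LEVEL_4K is the desired target level:
 *
 *	static void example_eager_split(struct kvm *kvm,
 *					const struct kvm_memory_slot *slot)
 *	{
 *		gfn_t start = slot->base_gfn;
 *		gfn_t end = start + slot->npages;
 *
 *		kvm_tdp_mmu_try_split_huge_pages(kvm, slot, start, end,
 *						 PG_LEVEL_4K, false);
 *	}
 */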
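
/*
 * Lockless walk helpers.  These rely on TDP MMU paging-structure memory
 * being freed only after an RCU grace period, so holding the RCU read lock
 * keeps a walk safe without taking mmu_lock.
 */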
static inline void kvm_tdp_mmu_walk_lockless_begin(void)
{
	rcu_read_lock();
}

static inline void kvm_tdp_mmu_walk_lockless_end(void)
{
	rcu_read_unlock();
}

int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level);
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
					u64 *spte);
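
/*
 * Minimal sketch of a lockless SPTE walk.  The function is hypothetical;
 * the buffer size mirrors the PT64_ROOT_MAX_LEVEL-sized arrays used
 * elsewhere in the x86 MMU, and kvm_tdp_mmu_get_walk() is assumed to
 * return the leaf level of the walk:
 *
 *	static int example_dump_sptes(struct kvm_vcpu *vcpu, u64 addr)
 *	{
 *		u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
 *		int leaf, root_level;
 *
 *		kvm_tdp_mmu_walk_lockless_begin();
 *		leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
 *		kvm_tdp_mmu_walk_lockless_end();
 *
 *		return leaf;
 *	}
 */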
#ifdef CONFIG_X86_64
int kvm_mmu_init_tdp_mmu(struct kvm *kvm);
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);

static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; }

static inline bool is_tdp_mmu(struct kvm_mmu *mmu)
{
	struct kvm_mmu_page *sp;
	hpa_t hpa = mmu->root.hpa;

	if (WARN_ON(!VALID_PAGE(hpa)))
		return false;

	/*
	 * A NULL shadow page is legal when shadowing a non-paging guest with
	 * PAE paging, as the MMU will be direct with root_hpa pointing at the
	 * pae_root page, not a shadow page.
	 */
	sp = to_shadow_page(hpa);
	return sp && is_tdp_mmu_page(sp) && sp->root_count;
}
#else
static inline int kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return 0; }
static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {}
static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; }
#endif

#endif /* __KVM_X86_MMU_TDP_MMU_H */