linux/virt/kvm/dirty_ring.c
Peter Xu fb04a1eddb KVM: X86: Implement ring-based dirty memory tracking
This patch is heavily based on previous work from Lei Cao
<lei.cao@stratus.com> and Paolo Bonzini <pbonzini@redhat.com>. [1]

KVM currently uses large bitmaps to track dirty memory.  These bitmaps
are copied to userspace when userspace queries KVM for its dirty page
information.  The use of bitmaps is mostly sufficient for live
migration, as large parts of memory are dirtied from one log-dirty
pass to another.  However, in a checkpointing system, the number of
dirty pages is small and in fact it is often bounded---the VM is
paused when it has dirtied a pre-defined number of pages. Traversing a
large, sparsely populated bitmap to find set bits is time-consuming,
as is copying the bitmap to user-space.

A similar issue arises for live migration when the guest memory
is huge but only a few pages are being dirtied.  In that case, for
each dirty sync we need to pull the whole dirty bitmap to userspace
and analyse every bit even if it's mostly zeros.

The preferred data structure for the above scenarios is a dense list of
guest frame numbers (GFN).  This patch series stores the dirty list in
kernel memory that can be memory mapped into userspace to allow speedy
harvesting.

This patch enables dirty ring for X86 only.  However it should be
easily extended to other archs as well.

[1] https://patchwork.kernel.org/patch/10471409/

Signed-off-by: Lei Cao <lei.cao@stratus.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <20201001012222.5767-1-peterx@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2020-11-15 09:49:15 -05:00

195 lines
4.4 KiB
C

/* SPDX-License-Identifier: GPL-2.0-only */
/*
* KVM dirty ring implementation
*
* Copyright 2019 Red Hat, Inc.
*/
#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/vmalloc.h>
#include <linux/kvm_dirty_ring.h>
#include <trace/events/kvm.h>
/*
 * Fallback definition of kvm_cpu_dirty_log_size().
 *
 * Declared __weak, so a non-weak definition elsewhere takes precedence at
 * link time.  This fallback reports 0; the value is added to the reserved
 * entry count in kvm_dirty_ring_get_rsvd_entries().
 */
int __weak kvm_cpu_dirty_log_size(void)
{
	return 0;
}
/*
 * Number of ring entries held in reserve: the fixed
 * KVM_DIRTY_RING_RSVD_ENTRIES plus whatever kvm_cpu_dirty_log_size()
 * reports.
 */
u32 kvm_dirty_ring_get_rsvd_entries(void)
{
	u32 reserved = KVM_DIRTY_RING_RSVD_ENTRIES + kvm_cpu_dirty_log_size();

	return reserved;
}
/*
 * Count of entries currently outstanding in @ring: the distance from
 * reset_index to dirty_index.  Both indices are sampled with READ_ONCE()
 * and the unsigned subtraction keeps the result correct across index
 * wrap-around.
 */
static u32 kvm_dirty_ring_used(struct kvm_dirty_ring *ring)
{
	u32 pushed = READ_ONCE(ring->dirty_index);
	u32 consumed = READ_ONCE(ring->reset_index);

	return pushed - consumed;
}
bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring)
{
return kvm_dirty_ring_used(ring) >= ring->soft_limit;
}
static bool kvm_dirty_ring_full(struct kvm_dirty_ring *ring)
{
return kvm_dirty_ring_used(ring) >= ring->size;
}
/*
 * Return the dirty ring embedded in the vCPU reported by
 * kvm_get_running_vcpu().  Warns (once) if that vCPU does not belong to
 * @kvm; the ring is returned regardless.
 */
struct kvm_dirty_ring *kvm_dirty_ring_get(struct kvm *kvm)
{
	struct kvm_vcpu *running = kvm_get_running_vcpu();

	WARN_ON_ONCE(running->kvm != kvm);

	return &running->dirty_ring;
}
/*
 * Re-enable dirty logging for one coalesced batch of pages.
 *
 * @kvm:    VM whose memslots are looked up.
 * @slot:   packed slot identifier; the address-space id is taken from the
 *          bits above bit 15, the memslot id from the low 16 bits.
 * @offset: page offset inside the memslot that bit 0 of @mask refers to.
 * @mask:   bit N set means page (@offset + N) is part of the batch.
 *
 * The request is dropped without any effect when @mask is empty, when the
 * slot identifier is out of range or does not resolve to a memslot, or when
 * the highest page named by @mask lies outside the memslot.  Otherwise
 * kvm_arch_mmu_enable_log_dirty_pt_masked() is invoked under kvm->mmu_lock.
 */
static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask)
{
	struct kvm_memory_slot *memslot;
	int as_id, id;

	/*
	 * __fls() is undefined for 0, and an empty mask names no page, so
	 * there is nothing to do.  kvm_dirty_ring_reset() passes mask == 0 on
	 * its first flush and when no entry was harvested at all.
	 */
	if (!mask)
		return;

	as_id = slot >> 16;
	id = (u16)slot;

	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
		return;

	memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
	if (!memslot)
		return;

	/*
	 * @offset is read back from ring entries.  Compare without computing
	 * offset + __fls(mask), so that a huge offset cannot wrap around and
	 * slip past the bounds check.
	 */
	if (offset >= memslot->npages ||
	    __fls(mask) >= memslot->npages - offset)
		return;

	spin_lock(&kvm->mmu_lock);
	kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask);
	spin_unlock(&kvm->mmu_lock);
}
/*
 * Allocate and initialise the entry array of a dirty ring.
 *
 * @ring:  ring to initialise.
 * @index: value stored in ring->index.
 * @size:  size of the entry array in BYTES; the capacity in entries is
 *         @size / sizeof(struct kvm_dirty_gfn).
 *
 * Returns 0 on success, or -ENOMEM if the array cannot be allocated (in
 * which case the remaining fields of @ring are left untouched).
 *
 * NOTE(review): the push/reset paths index the array with
 * "index & (ring->size - 1)", which assumes the capacity is a power of two.
 * That is not checked here -- presumably the caller validates @size.
 */
int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size)
{
	/* Zeroed memory: every entry starts out with flags == 0. */
	ring->dirty_gfns = vzalloc(size);
	if (!ring->dirty_gfns)
		return -ENOMEM;

	ring->size = size / sizeof(struct kvm_dirty_gfn);
	ring->soft_limit = ring->size - kvm_dirty_ring_get_rsvd_entries();
	ring->dirty_index = 0;
	ring->reset_index = 0;
	ring->index = index;

	return 0;
}
/* Clear every flag of @gfn, putting the entry into the "invalid" state. */
static inline void kvm_dirty_gfn_set_invalid(struct kvm_dirty_gfn *gfn)
{
	gfn->flags = 0;
}
/* Overwrite the flags of @gfn so that only KVM_DIRTY_GFN_F_DIRTY is set. */
static inline void kvm_dirty_gfn_set_dirtied(struct kvm_dirty_gfn *gfn)
{
	gfn->flags = KVM_DIRTY_GFN_F_DIRTY;
}
/* An entry is "invalid" when none of its flags are set. */
static inline bool kvm_dirty_gfn_invalid(struct kvm_dirty_gfn *gfn)
{
	return !gfn->flags;
}
/* An entry counts as harvested when KVM_DIRTY_GFN_F_RESET is set in its flags. */
static inline bool kvm_dirty_gfn_harvested(struct kvm_dirty_gfn *gfn)
{
	return (gfn->flags & KVM_DIRTY_GFN_F_RESET) != 0;
}
/*
 * Consume harvested entries from @ring and reset the pages they name.
 *
 * Starting at ring->reset_index, every consecutive entry that has
 * KVM_DIRTY_GFN_F_RESET set is read, marked invalid and counted; the walk
 * stops at the first entry without that flag.  Entries in the same slot
 * whose offsets fit into one BITS_PER_LONG-wide window are merged into a
 * single (slot, offset, mask) batch, and each batch is handed to
 * kvm_reset_dirty_gfn().
 *
 * Returns the number of ring entries consumed.
 */
int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring)
{
/* cur_*: the batch being accumulated; next_*: the entry just read. */
u32 cur_slot, next_slot;
u64 cur_offset, next_offset;
/* Bit N set means page (cur_offset + N) belongs to the current batch. */
unsigned long mask;
int count = 0;
struct kvm_dirty_gfn *entry;
/* True until the first harvested entry has opened a batch. */
bool first_round = true;
/* This is only needed to make compilers happy */
/*
 * NOTE(review): mask == 0 is also what gets passed to
 * kvm_reset_dirty_gfn() by the first flush inside the loop and by the
 * final flush when nothing was harvested, so the callee must cope with
 * an empty mask.
 */
cur_slot = cur_offset = mask = 0;
while (true) {
/* Masking with (size - 1) assumes ring->size is a power of two. */
entry = &ring->dirty_gfns[ring->reset_index & (ring->size - 1)];
if (!kvm_dirty_gfn_harvested(entry))
break;
/* Sample slot/offset once each before the entry is invalidated. */
next_slot = READ_ONCE(entry->slot);
next_offset = READ_ONCE(entry->offset);
/* Update the flags to reflect that this GFN is reset */
kvm_dirty_gfn_set_invalid(entry);
ring->reset_index++;
count++;
/*
* Try to coalesce the reset operations when the guest is
* scanning pages in the same slot.
*/
if (!first_round && next_slot == cur_slot) {
/* Signed distance of the new page from the batch base. */
s64 delta = next_offset - cur_offset;
/* At or above the base and inside the window: just set its bit. */
if (delta >= 0 && delta < BITS_PER_LONG) {
mask |= 1ull << delta;
continue;
}
/* Backwards visit, careful about overflows! */
/*
 * The new page lies below the base.  Rebase the batch onto it only
 * if shifting the mask left by -delta loses no bits (the shift
 * left/right round trip must give the mask back unchanged).
 */
if (delta > -BITS_PER_LONG && delta < 0 &&
(mask << -delta >> -delta) == mask) {
cur_offset = next_offset;
mask = (mask << -delta) | 1;
continue;
}
}
/* Could not merge: flush the current batch and start a new one. */
kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
cur_slot = next_slot;
cur_offset = next_offset;
mask = 1;
first_round = false;
}
/* Flush whatever batch is still pending once the walk has stopped. */
kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
trace_kvm_dirty_ring_reset(ring);
return count;
}
/*
 * Append one dirty page record to @ring.
 *
 * @ring:   ring to append to.
 * @slot:   value stored in the entry's slot field.
 * @offset: value stored in the entry's offset field.
 *
 * The entry at dirty_index (modulo the ring size) is filled in, flagged
 * KVM_DIRTY_GFN_F_DIRTY, and dirty_index is advanced.  A full ring only
 * triggers a one-time warning; the entry is written regardless.
 */
void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, u32 slot, u64 offset)
{
	struct kvm_dirty_gfn *gfn;

	/* It should never get full */
	WARN_ON_ONCE(kvm_dirty_ring_full(ring));

	gfn = ring->dirty_gfns + (ring->dirty_index & (ring->size - 1));

	gfn->slot = slot;
	gfn->offset = offset;

	/*
	 * The payload must be visible before the DIRTY flag is: the flag is
	 * what publishes the entry to the userspace program.  There's no
	 * paired kernel-side reader.
	 */
	smp_wmb();
	kvm_dirty_gfn_set_dirtied(gfn);

	ring->dirty_index++;
	trace_kvm_dirty_ring_push(ring, slot, offset);
}
/*
 * Look up the struct page backing part of the ring's entry array.
 *
 * @offset is counted in pages (it is scaled by PAGE_SIZE) from the start
 * of ring->dirty_gfns.  No bounds check is performed here.
 */
struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset)
{
	void *base = (void *)ring->dirty_gfns;
	void *addr = base + offset * PAGE_SIZE;

	return vmalloc_to_page(addr);
}
/*
 * Release the entry array of @ring and clear the pointer so that it is
 * not left dangling.
 */
void kvm_dirty_ring_free(struct kvm_dirty_ring *ring)
{
	void *entries = ring->dirty_gfns;

	vfree(entries);
	ring->dirty_gfns = NULL;
}