mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2025-01-07 14:24:18 +08:00
e0b7ec058c
On a threaded processor such as POWER7, we group VCPUs into virtual cores and arrange that the VCPUs in a virtual core run on the same physical core. Currently we don't enforce any correspondence between virtual thread numbers within a virtual core and physical thread numbers. Physical threads are allocated starting at 0 on a first-come first-served basis to runnable virtual threads (VCPUs). POWER8 implements a new "msgsndp" instruction which guest kernels can use to interrupt other threads in the same core or sub-core. Since the instruction takes the destination physical thread ID as a parameter, it becomes necessary to align the physical thread IDs with the virtual thread IDs, that is, to make sure virtual thread N within a virtual core always runs on physical thread N. This means that it's possible that thread 0, which is where we call __kvmppc_vcore_entry, may end up running some other vcpu than the one whose task called kvmppc_run_core(), or it may end up running no vcpu at all, if for example thread 0 of the virtual core is currently executing in userspace. However, we do need thread 0 to be responsible for switching the MMU -- a previous version of this patch that had other threads switching the MMU was found to be responsible for occasional memory corruption and machine check interrupts in the guest on POWER7 machines. To accommodate this, we no longer pass the vcpu pointer to __kvmppc_vcore_entry, but instead let the assembly code load it from the PACA. Since the assembly code will need to know the kvm pointer and the thread ID for threads which don't have a vcpu, we move the thread ID into the PACA and we add a kvm pointer to the virtual core structure. In the case where thread 0 has no vcpu to run, it still calls into kvmppc_hv_entry in order to do the MMU switch, and then naps until either its vcpu is ready to run in the guest, or some other thread needs to exit the guest. 
In the latter case, thread 0 jumps to the code that switches the MMU back to the host. This control flow means that now we switch the MMU before loading any guest vcpu state. Similarly, on guest exit we now save all the guest vcpu state before switching the MMU back to the host. This has required substantial code movement, making the diff rather large. Signed-off-by: Paul Mackerras <paulus@samba.org> Signed-off-by: Alexander Graf <agraf@suse.de>
2173 lines
55 KiB
C
2173 lines
55 KiB
C
/*
|
|
* Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
|
|
* Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
|
|
*
|
|
* Authors:
|
|
* Paul Mackerras <paulus@au1.ibm.com>
|
|
* Alexander Graf <agraf@suse.de>
|
|
* Kevin Wolf <mail@kevin-wolf.de>
|
|
*
|
|
* Description: KVM functions specific to running on Book 3S
|
|
* processors in hypervisor mode (specifically POWER7 and later).
|
|
*
|
|
* This file is derived from arch/powerpc/kvm/book3s.c,
|
|
* by Alexander Graf <agraf@suse.de>.
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License, version 2, as
|
|
* published by the Free Software Foundation.
|
|
*/
|
|
|
|
#include <linux/kvm_host.h>
|
|
#include <linux/err.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/preempt.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/delay.h>
|
|
#include <linux/export.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/anon_inodes.h>
|
|
#include <linux/cpumask.h>
|
|
#include <linux/spinlock.h>
|
|
#include <linux/page-flags.h>
|
|
#include <linux/srcu.h>
|
|
#include <linux/miscdevice.h>
|
|
|
|
#include <asm/reg.h>
|
|
#include <asm/cputable.h>
|
|
#include <asm/cacheflush.h>
|
|
#include <asm/tlbflush.h>
|
|
#include <asm/uaccess.h>
|
|
#include <asm/io.h>
|
|
#include <asm/kvm_ppc.h>
|
|
#include <asm/kvm_book3s.h>
|
|
#include <asm/mmu_context.h>
|
|
#include <asm/lppaca.h>
|
|
#include <asm/processor.h>
|
|
#include <asm/cputhreads.h>
|
|
#include <asm/page.h>
|
|
#include <asm/hvcall.h>
|
|
#include <asm/switch_to.h>
|
|
#include <asm/smp.h>
|
|
#include <linux/gfp.h>
|
|
#include <linux/vmalloc.h>
|
|
#include <linux/highmem.h>
|
|
#include <linux/hugetlb.h>
|
|
#include <linux/module.h>
|
|
|
|
#include "book3s.h"
|
|
|
|
/* #define EXIT_DEBUG */
|
|
/* #define EXIT_DEBUG_SIMPLE */
|
|
/* #define EXIT_DEBUG_INT */
|
|
|
|
/* Resume code meaning: a guest page fault still needs to be handled */
#define RESUME_PAGE_FAULT	(RESUME_GUEST | RESUME_FLAG_ARCH1)

/* All-ones sentinel used as the "no value" marker for timebase fields */
#define TB_NIL	(~(u64)0)

/* Forward declarations for helpers defined further down */
static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
|
|
|
|
static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
|
|
{
|
|
int me;
|
|
int cpu = vcpu->cpu;
|
|
wait_queue_head_t *wqp;
|
|
|
|
wqp = kvm_arch_vcpu_wq(vcpu);
|
|
if (waitqueue_active(wqp)) {
|
|
wake_up_interruptible(wqp);
|
|
++vcpu->stat.halt_wakeup;
|
|
}
|
|
|
|
me = get_cpu();
|
|
|
|
/* CPU points to the first thread of the core */
|
|
if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) {
|
|
#ifdef CONFIG_KVM_XICS
|
|
int real_cpu = cpu + vcpu->arch.ptid;
|
|
if (paca[real_cpu].kvm_hstate.xics_phys)
|
|
xics_wake_cpu(real_cpu);
|
|
else
|
|
#endif
|
|
if (cpu_online(cpu))
|
|
smp_send_reschedule(cpu);
|
|
}
|
|
put_cpu();
|
|
}
|
|
|
|
/*
|
|
* We use the vcpu_load/put functions to measure stolen time.
|
|
* Stolen time is counted as time when either the vcpu is able to
|
|
* run as part of a virtual core, but the task running the vcore
|
|
* is preempted or sleeping, or when the vcpu needs something done
|
|
* in the kernel by the task running the vcpu, but that task is
|
|
* preempted or sleeping. Those two things have to be counted
|
|
* separately, since one of the vcpu tasks will take on the job
|
|
* of running the core, and the other vcpu tasks in the vcore will
|
|
* sleep waiting for it to do that, but that sleep shouldn't count
|
|
* as stolen time.
|
|
*
|
|
* Hence we accumulate stolen time when the vcpu can run as part of
|
|
* a vcore using vc->stolen_tb, and the stolen time when the vcpu
|
|
* needs its task to do other things in the kernel (for example,
|
|
* service a page fault) in busy_stolen. We don't accumulate
|
|
* stolen time for a vcore when it is inactive, or for a vcpu
|
|
* when it is in state RUNNING or NOTREADY. NOTREADY is a bit of
|
|
* a misnomer; it means that the vcpu task is not executing in
|
|
* the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in
|
|
* the kernel. We don't have any way of dividing up that time
|
|
* between time that the vcpu is genuinely stopped, time that
|
|
* the task is actively working on behalf of the vcpu, and time
|
|
* that the task is preempted, so we don't count any of it as
|
|
* stolen.
|
|
*
|
|
* Updates to busy_stolen are protected by arch.tbacct_lock;
|
|
* updates to vc->stolen_tb are protected by the arch.tbacct_lock
|
|
* of the vcpu that has taken responsibility for running the vcore
|
|
* (i.e. vc->runner). The stolen times are measured in units of
|
|
* timebase ticks. (Note that the != TB_NIL checks below are
|
|
* purely defensive; they should never fail.)
|
|
*/
|
|
|
|
static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
|
|
{
|
|
struct kvmppc_vcore *vc = vcpu->arch.vcore;
|
|
|
|
spin_lock(&vcpu->arch.tbacct_lock);
|
|
if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE &&
|
|
vc->preempt_tb != TB_NIL) {
|
|
vc->stolen_tb += mftb() - vc->preempt_tb;
|
|
vc->preempt_tb = TB_NIL;
|
|
}
|
|
if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST &&
|
|
vcpu->arch.busy_preempt != TB_NIL) {
|
|
vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt;
|
|
vcpu->arch.busy_preempt = TB_NIL;
|
|
}
|
|
spin_unlock(&vcpu->arch.tbacct_lock);
|
|
}
|
|
|
|
static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
|
|
{
|
|
struct kvmppc_vcore *vc = vcpu->arch.vcore;
|
|
|
|
spin_lock(&vcpu->arch.tbacct_lock);
|
|
if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE)
|
|
vc->preempt_tb = mftb();
|
|
if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
|
|
vcpu->arch.busy_preempt = mftb();
|
|
spin_unlock(&vcpu->arch.tbacct_lock);
|
|
}
|
|
|
|
/*
 * Store a new guest MSR value in the shared register block and end any
 * cede that is in progress for this vcpu.
 */
static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr)
{
	vcpu->arch.shregs.msr = msr;

	/* Clears arch.ceded and cancels the decrementer timer if armed */
	kvmppc_end_cede(vcpu);
}
|
|
|
|
/* Record the processor version register value reported for this vcpu. */
void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
{
	vcpu->arch.pvr = pvr;
}
|
|
|
|
/*
 * Set the architecture compatibility level of the vcpu's virtual core.
 *
 * @arch_compat: 0 for "no compatibility mode", or one of PVR_ARCH_205,
 *               PVR_ARCH_206, PVR_ARCH_206p.
 *
 * Returns 0 on success, -EINVAL for an unknown level or when a non-zero
 * level is requested on a CPU lacking CPU_FTR_ARCH_206.
 */
int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
{
	struct kvmppc_vcore *vcore = vcpu->arch.vcore;
	unsigned long new_pcr = 0;

	if (arch_compat != 0) {
		/* 970 has no compat mode support */
		if (!cpu_has_feature(CPU_FTR_ARCH_206))
			return -EINVAL;

		if (arch_compat == PVR_ARCH_205)
			new_pcr = PCR_ARCH_205;
		else if (arch_compat != PVR_ARCH_206 &&
			 arch_compat != PVR_ARCH_206p)
			return -EINVAL;
		/* the 2.06 and 2.06+ levels leave the PCR value at zero */
	}

	/* Publish both fields together under the vcore lock */
	spin_lock(&vcore->lock);
	vcore->arch_compat = arch_compat;
	vcore->pcr = new_pcr;
	spin_unlock(&vcore->lock);

	return 0;
}
|
|
|
|
void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
|
|
{
|
|
int r;
|
|
|
|
pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
|
|
pr_err("pc = %.16lx msr = %.16llx trap = %x\n",
|
|
vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap);
|
|
for (r = 0; r < 16; ++r)
|
|
pr_err("r%2d = %.16lx r%d = %.16lx\n",
|
|
r, kvmppc_get_gpr(vcpu, r),
|
|
r+16, kvmppc_get_gpr(vcpu, r+16));
|
|
pr_err("ctr = %.16lx lr = %.16lx\n",
|
|
vcpu->arch.ctr, vcpu->arch.lr);
|
|
pr_err("srr0 = %.16llx srr1 = %.16llx\n",
|
|
vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
|
|
pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
|
|
vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
|
|
pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
|
|
vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
|
|
pr_err("cr = %.8x xer = %.16lx dsisr = %.8x\n",
|
|
vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr);
|
|
pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
|
|
pr_err("fault dar = %.16lx dsisr = %.8x\n",
|
|
vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
|
|
pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
|
|
for (r = 0; r < vcpu->arch.slb_max; ++r)
|
|
pr_err(" ESID = %.16llx VSID = %.16llx\n",
|
|
vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
|
|
pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
|
|
vcpu->arch.vcore->lpcr, vcpu->kvm->arch.sdr1,
|
|
vcpu->arch.last_inst);
|
|
}
|
|
|
|
struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
|
|
{
|
|
int r;
|
|
struct kvm_vcpu *v, *ret = NULL;
|
|
|
|
mutex_lock(&kvm->lock);
|
|
kvm_for_each_vcpu(r, v, kvm) {
|
|
if (v->vcpu_id == id) {
|
|
ret = v;
|
|
break;
|
|
}
|
|
}
|
|
mutex_unlock(&kvm->lock);
|
|
return ret;
|
|
}
|
|
|
|
static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
|
|
{
|
|
vpa->__old_status |= LPPACA_OLD_SHARED_PROC;
|
|
vpa->yield_count = 1;
|
|
}
|
|
|
|
static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v,
|
|
unsigned long addr, unsigned long len)
|
|
{
|
|
/* check address is cacheline aligned */
|
|
if (addr & (L1_CACHE_BYTES - 1))
|
|
return -EINVAL;
|
|
spin_lock(&vcpu->arch.vpa_update_lock);
|
|
if (v->next_gpa != addr || v->len != len) {
|
|
v->next_gpa = addr;
|
|
v->len = addr ? len : 0;
|
|
v->update_pending = 1;
|
|
}
|
|
spin_unlock(&vcpu->arch.vpa_update_lock);
|
|
return 0;
|
|
}
|
|
|
|
/* Length for a per-processor buffer is passed in at offset 4 in the buffer */
|
|
struct reg_vpa {
|
|
u32 dummy;
|
|
union {
|
|
u16 hword;
|
|
u32 word;
|
|
} length;
|
|
};
|
|
|
|
static int vpa_is_registered(struct kvmppc_vpa *vpap)
|
|
{
|
|
if (vpap->update_pending)
|
|
return vpap->next_gpa != 0;
|
|
return vpap->pinned_addr != NULL;
|
|
}
|
|
|
|
static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
|
|
unsigned long flags,
|
|
unsigned long vcpuid, unsigned long vpa)
|
|
{
|
|
struct kvm *kvm = vcpu->kvm;
|
|
unsigned long len, nb;
|
|
void *va;
|
|
struct kvm_vcpu *tvcpu;
|
|
int err;
|
|
int subfunc;
|
|
struct kvmppc_vpa *vpap;
|
|
|
|
tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
|
|
if (!tvcpu)
|
|
return H_PARAMETER;
|
|
|
|
subfunc = (flags >> H_VPA_FUNC_SHIFT) & H_VPA_FUNC_MASK;
|
|
if (subfunc == H_VPA_REG_VPA || subfunc == H_VPA_REG_DTL ||
|
|
subfunc == H_VPA_REG_SLB) {
|
|
/* Registering new area - address must be cache-line aligned */
|
|
if ((vpa & (L1_CACHE_BYTES - 1)) || !vpa)
|
|
return H_PARAMETER;
|
|
|
|
/* convert logical addr to kernel addr and read length */
|
|
va = kvmppc_pin_guest_page(kvm, vpa, &nb);
|
|
if (va == NULL)
|
|
return H_PARAMETER;
|
|
if (subfunc == H_VPA_REG_VPA)
|
|
len = ((struct reg_vpa *)va)->length.hword;
|
|
else
|
|
len = ((struct reg_vpa *)va)->length.word;
|
|
kvmppc_unpin_guest_page(kvm, va, vpa, false);
|
|
|
|
/* Check length */
|
|
if (len > nb || len < sizeof(struct reg_vpa))
|
|
return H_PARAMETER;
|
|
} else {
|
|
vpa = 0;
|
|
len = 0;
|
|
}
|
|
|
|
err = H_PARAMETER;
|
|
vpap = NULL;
|
|
spin_lock(&tvcpu->arch.vpa_update_lock);
|
|
|
|
switch (subfunc) {
|
|
case H_VPA_REG_VPA: /* register VPA */
|
|
if (len < sizeof(struct lppaca))
|
|
break;
|
|
vpap = &tvcpu->arch.vpa;
|
|
err = 0;
|
|
break;
|
|
|
|
case H_VPA_REG_DTL: /* register DTL */
|
|
if (len < sizeof(struct dtl_entry))
|
|
break;
|
|
len -= len % sizeof(struct dtl_entry);
|
|
|
|
/* Check that they have previously registered a VPA */
|
|
err = H_RESOURCE;
|
|
if (!vpa_is_registered(&tvcpu->arch.vpa))
|
|
break;
|
|
|
|
vpap = &tvcpu->arch.dtl;
|
|
err = 0;
|
|
break;
|
|
|
|
case H_VPA_REG_SLB: /* register SLB shadow buffer */
|
|
/* Check that they have previously registered a VPA */
|
|
err = H_RESOURCE;
|
|
if (!vpa_is_registered(&tvcpu->arch.vpa))
|
|
break;
|
|
|
|
vpap = &tvcpu->arch.slb_shadow;
|
|
err = 0;
|
|
break;
|
|
|
|
case H_VPA_DEREG_VPA: /* deregister VPA */
|
|
/* Check they don't still have a DTL or SLB buf registered */
|
|
err = H_RESOURCE;
|
|
if (vpa_is_registered(&tvcpu->arch.dtl) ||
|
|
vpa_is_registered(&tvcpu->arch.slb_shadow))
|
|
break;
|
|
|
|
vpap = &tvcpu->arch.vpa;
|
|
err = 0;
|
|
break;
|
|
|
|
case H_VPA_DEREG_DTL: /* deregister DTL */
|
|
vpap = &tvcpu->arch.dtl;
|
|
err = 0;
|
|
break;
|
|
|
|
case H_VPA_DEREG_SLB: /* deregister SLB shadow buffer */
|
|
vpap = &tvcpu->arch.slb_shadow;
|
|
err = 0;
|
|
break;
|
|
}
|
|
|
|
if (vpap) {
|
|
vpap->next_gpa = vpa;
|
|
vpap->len = len;
|
|
vpap->update_pending = 1;
|
|
}
|
|
|
|
spin_unlock(&tvcpu->arch.vpa_update_lock);
|
|
|
|
return err;
|
|
}
|
|
|
|
static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
|
|
{
|
|
struct kvm *kvm = vcpu->kvm;
|
|
void *va;
|
|
unsigned long nb;
|
|
unsigned long gpa;
|
|
|
|
/*
|
|
* We need to pin the page pointed to by vpap->next_gpa,
|
|
* but we can't call kvmppc_pin_guest_page under the lock
|
|
* as it does get_user_pages() and down_read(). So we
|
|
* have to drop the lock, pin the page, then get the lock
|
|
* again and check that a new area didn't get registered
|
|
* in the meantime.
|
|
*/
|
|
for (;;) {
|
|
gpa = vpap->next_gpa;
|
|
spin_unlock(&vcpu->arch.vpa_update_lock);
|
|
va = NULL;
|
|
nb = 0;
|
|
if (gpa)
|
|
va = kvmppc_pin_guest_page(kvm, gpa, &nb);
|
|
spin_lock(&vcpu->arch.vpa_update_lock);
|
|
if (gpa == vpap->next_gpa)
|
|
break;
|
|
/* sigh... unpin that one and try again */
|
|
if (va)
|
|
kvmppc_unpin_guest_page(kvm, va, gpa, false);
|
|
}
|
|
|
|
vpap->update_pending = 0;
|
|
if (va && nb < vpap->len) {
|
|
/*
|
|
* If it's now too short, it must be that userspace
|
|
* has changed the mappings underlying guest memory,
|
|
* so unregister the region.
|
|
*/
|
|
kvmppc_unpin_guest_page(kvm, va, gpa, false);
|
|
va = NULL;
|
|
}
|
|
if (vpap->pinned_addr)
|
|
kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa,
|
|
vpap->dirty);
|
|
vpap->gpa = gpa;
|
|
vpap->pinned_addr = va;
|
|
vpap->dirty = false;
|
|
if (va)
|
|
vpap->pinned_end = va + vpap->len;
|
|
}
|
|
|
|
static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
|
|
{
|
|
if (!(vcpu->arch.vpa.update_pending ||
|
|
vcpu->arch.slb_shadow.update_pending ||
|
|
vcpu->arch.dtl.update_pending))
|
|
return;
|
|
|
|
spin_lock(&vcpu->arch.vpa_update_lock);
|
|
if (vcpu->arch.vpa.update_pending) {
|
|
kvmppc_update_vpa(vcpu, &vcpu->arch.vpa);
|
|
if (vcpu->arch.vpa.pinned_addr)
|
|
init_vpa(vcpu, vcpu->arch.vpa.pinned_addr);
|
|
}
|
|
if (vcpu->arch.dtl.update_pending) {
|
|
kvmppc_update_vpa(vcpu, &vcpu->arch.dtl);
|
|
vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr;
|
|
vcpu->arch.dtl_index = 0;
|
|
}
|
|
if (vcpu->arch.slb_shadow.update_pending)
|
|
kvmppc_update_vpa(vcpu, &vcpu->arch.slb_shadow);
|
|
spin_unlock(&vcpu->arch.vpa_update_lock);
|
|
}
|
|
|
|
/*
|
|
* Return the accumulated stolen time for the vcore up until `now'.
|
|
* The caller should hold the vcore lock.
|
|
*/
|
|
static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now)
|
|
{
|
|
u64 p;
|
|
|
|
/*
|
|
* If we are the task running the vcore, then since we hold
|
|
* the vcore lock, we can't be preempted, so stolen_tb/preempt_tb
|
|
* can't be updated, so we don't need the tbacct_lock.
|
|
* If the vcore is inactive, it can't become active (since we
|
|
* hold the vcore lock), so the vcpu load/put functions won't
|
|
* update stolen_tb/preempt_tb, and we don't need tbacct_lock.
|
|
*/
|
|
if (vc->vcore_state != VCORE_INACTIVE &&
|
|
vc->runner->arch.run_task != current) {
|
|
spin_lock(&vc->runner->arch.tbacct_lock);
|
|
p = vc->stolen_tb;
|
|
if (vc->preempt_tb != TB_NIL)
|
|
p += now - vc->preempt_tb;
|
|
spin_unlock(&vc->runner->arch.tbacct_lock);
|
|
} else {
|
|
p = vc->stolen_tb;
|
|
}
|
|
return p;
|
|
}
|
|
|
|
static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
|
|
struct kvmppc_vcore *vc)
|
|
{
|
|
struct dtl_entry *dt;
|
|
struct lppaca *vpa;
|
|
unsigned long stolen;
|
|
unsigned long core_stolen;
|
|
u64 now;
|
|
|
|
dt = vcpu->arch.dtl_ptr;
|
|
vpa = vcpu->arch.vpa.pinned_addr;
|
|
now = mftb();
|
|
core_stolen = vcore_stolen_time(vc, now);
|
|
stolen = core_stolen - vcpu->arch.stolen_logged;
|
|
vcpu->arch.stolen_logged = core_stolen;
|
|
spin_lock(&vcpu->arch.tbacct_lock);
|
|
stolen += vcpu->arch.busy_stolen;
|
|
vcpu->arch.busy_stolen = 0;
|
|
spin_unlock(&vcpu->arch.tbacct_lock);
|
|
if (!dt || !vpa)
|
|
return;
|
|
memset(dt, 0, sizeof(struct dtl_entry));
|
|
dt->dispatch_reason = 7;
|
|
dt->processor_id = vc->pcpu + vcpu->arch.ptid;
|
|
dt->timebase = now + vc->tb_offset;
|
|
dt->enqueue_to_dispatch_time = stolen;
|
|
dt->srr0 = kvmppc_get_pc(vcpu);
|
|
dt->srr1 = vcpu->arch.shregs.msr;
|
|
++dt;
|
|
if (dt == vcpu->arch.dtl.pinned_end)
|
|
dt = vcpu->arch.dtl.pinned_addr;
|
|
vcpu->arch.dtl_ptr = dt;
|
|
/* order writing *dt vs. writing vpa->dtl_idx */
|
|
smp_wmb();
|
|
vpa->dtl_idx = ++vcpu->arch.dtl_index;
|
|
vcpu->arch.dtl.dirty = true;
|
|
}
|
|
|
|
int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
|
|
{
|
|
unsigned long req = kvmppc_get_gpr(vcpu, 3);
|
|
unsigned long target, ret = H_SUCCESS;
|
|
struct kvm_vcpu *tvcpu;
|
|
int idx, rc;
|
|
|
|
switch (req) {
|
|
case H_ENTER:
|
|
idx = srcu_read_lock(&vcpu->kvm->srcu);
|
|
ret = kvmppc_virtmode_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4),
|
|
kvmppc_get_gpr(vcpu, 5),
|
|
kvmppc_get_gpr(vcpu, 6),
|
|
kvmppc_get_gpr(vcpu, 7));
|
|
srcu_read_unlock(&vcpu->kvm->srcu, idx);
|
|
break;
|
|
case H_CEDE:
|
|
break;
|
|
case H_PROD:
|
|
target = kvmppc_get_gpr(vcpu, 4);
|
|
tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
|
|
if (!tvcpu) {
|
|
ret = H_PARAMETER;
|
|
break;
|
|
}
|
|
tvcpu->arch.prodded = 1;
|
|
smp_mb();
|
|
if (vcpu->arch.ceded) {
|
|
if (waitqueue_active(&vcpu->wq)) {
|
|
wake_up_interruptible(&vcpu->wq);
|
|
vcpu->stat.halt_wakeup++;
|
|
}
|
|
}
|
|
break;
|
|
case H_CONFER:
|
|
target = kvmppc_get_gpr(vcpu, 4);
|
|
if (target == -1)
|
|
break;
|
|
tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
|
|
if (!tvcpu) {
|
|
ret = H_PARAMETER;
|
|
break;
|
|
}
|
|
kvm_vcpu_yield_to(tvcpu);
|
|
break;
|
|
case H_REGISTER_VPA:
|
|
ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
|
|
kvmppc_get_gpr(vcpu, 5),
|
|
kvmppc_get_gpr(vcpu, 6));
|
|
break;
|
|
case H_RTAS:
|
|
if (list_empty(&vcpu->kvm->arch.rtas_tokens))
|
|
return RESUME_HOST;
|
|
|
|
rc = kvmppc_rtas_hcall(vcpu);
|
|
|
|
if (rc == -ENOENT)
|
|
return RESUME_HOST;
|
|
else if (rc == 0)
|
|
break;
|
|
|
|
/* Send the error out to userspace via KVM_RUN */
|
|
return rc;
|
|
|
|
case H_XIRR:
|
|
case H_CPPR:
|
|
case H_EOI:
|
|
case H_IPI:
|
|
case H_IPOLL:
|
|
case H_XIRR_X:
|
|
if (kvmppc_xics_enabled(vcpu)) {
|
|
ret = kvmppc_xics_hcall(vcpu, req);
|
|
break;
|
|
} /* fallthrough */
|
|
default:
|
|
return RESUME_HOST;
|
|
}
|
|
kvmppc_set_gpr(vcpu, 3, ret);
|
|
vcpu->arch.hcall_needed = 0;
|
|
return RESUME_GUEST;
|
|
}
|
|
|
|
static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
|
|
struct task_struct *tsk)
|
|
{
|
|
int r = RESUME_HOST;
|
|
|
|
vcpu->stat.sum_exits++;
|
|
|
|
run->exit_reason = KVM_EXIT_UNKNOWN;
|
|
run->ready_for_interrupt_injection = 1;
|
|
switch (vcpu->arch.trap) {
|
|
/* We're good on these - the host merely wanted to get our attention */
|
|
case BOOK3S_INTERRUPT_HV_DECREMENTER:
|
|
vcpu->stat.dec_exits++;
|
|
r = RESUME_GUEST;
|
|
break;
|
|
case BOOK3S_INTERRUPT_EXTERNAL:
|
|
vcpu->stat.ext_intr_exits++;
|
|
r = RESUME_GUEST;
|
|
break;
|
|
case BOOK3S_INTERRUPT_PERFMON:
|
|
r = RESUME_GUEST;
|
|
break;
|
|
case BOOK3S_INTERRUPT_MACHINE_CHECK:
|
|
/*
|
|
* Deliver a machine check interrupt to the guest.
|
|
* We have to do this, even if the host has handled the
|
|
* machine check, because machine checks use SRR0/1 and
|
|
* the interrupt might have trashed guest state in them.
|
|
*/
|
|
kvmppc_book3s_queue_irqprio(vcpu,
|
|
BOOK3S_INTERRUPT_MACHINE_CHECK);
|
|
r = RESUME_GUEST;
|
|
break;
|
|
case BOOK3S_INTERRUPT_PROGRAM:
|
|
{
|
|
ulong flags;
|
|
/*
|
|
* Normally program interrupts are delivered directly
|
|
* to the guest by the hardware, but we can get here
|
|
* as a result of a hypervisor emulation interrupt
|
|
* (e40) getting turned into a 700 by BML RTAS.
|
|
*/
|
|
flags = vcpu->arch.shregs.msr & 0x1f0000ull;
|
|
kvmppc_core_queue_program(vcpu, flags);
|
|
r = RESUME_GUEST;
|
|
break;
|
|
}
|
|
case BOOK3S_INTERRUPT_SYSCALL:
|
|
{
|
|
/* hcall - punt to userspace */
|
|
int i;
|
|
|
|
/* hypercall with MSR_PR has already been handled in rmode,
|
|
* and never reaches here.
|
|
*/
|
|
|
|
run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
|
|
for (i = 0; i < 9; ++i)
|
|
run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
|
|
run->exit_reason = KVM_EXIT_PAPR_HCALL;
|
|
vcpu->arch.hcall_needed = 1;
|
|
r = RESUME_HOST;
|
|
break;
|
|
}
|
|
/*
|
|
* We get these next two if the guest accesses a page which it thinks
|
|
* it has mapped but which is not actually present, either because
|
|
* it is for an emulated I/O device or because the corresonding
|
|
* host page has been paged out. Any other HDSI/HISI interrupts
|
|
* have been handled already.
|
|
*/
|
|
case BOOK3S_INTERRUPT_H_DATA_STORAGE:
|
|
r = RESUME_PAGE_FAULT;
|
|
break;
|
|
case BOOK3S_INTERRUPT_H_INST_STORAGE:
|
|
vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
|
|
vcpu->arch.fault_dsisr = 0;
|
|
r = RESUME_PAGE_FAULT;
|
|
break;
|
|
/*
|
|
* This occurs if the guest executes an illegal instruction.
|
|
* We just generate a program interrupt to the guest, since
|
|
* we don't emulate any guest instructions at this stage.
|
|
*/
|
|
case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
|
|
kvmppc_core_queue_program(vcpu, 0x80000);
|
|
r = RESUME_GUEST;
|
|
break;
|
|
default:
|
|
kvmppc_dump_regs(vcpu);
|
|
printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
|
|
vcpu->arch.trap, kvmppc_get_pc(vcpu),
|
|
vcpu->arch.shregs.msr);
|
|
run->hw.hardware_exit_reason = vcpu->arch.trap;
|
|
r = RESUME_HOST;
|
|
break;
|
|
}
|
|
|
|
return r;
|
|
}
|
|
|
|
static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
|
|
struct kvm_sregs *sregs)
|
|
{
|
|
int i;
|
|
|
|
memset(sregs, 0, sizeof(struct kvm_sregs));
|
|
sregs->pvr = vcpu->arch.pvr;
|
|
for (i = 0; i < vcpu->arch.slb_max; i++) {
|
|
sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
|
|
sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu,
|
|
struct kvm_sregs *sregs)
|
|
{
|
|
int i, j;
|
|
|
|
kvmppc_set_pvr_hv(vcpu, sregs->pvr);
|
|
|
|
j = 0;
|
|
for (i = 0; i < vcpu->arch.slb_nr; i++) {
|
|
if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
|
|
vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
|
|
vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
|
|
++j;
|
|
}
|
|
}
|
|
vcpu->arch.slb_max = j;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Update the vcore's LPCR from a userspace-supplied value.  Only the
 * bits userspace may modify are taken from @new_lpcr; all other bits
 * keep their current value.  Done under the vcore lock.
 */
static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr)
{
	/*
	 * Userspace can only modify DPFD (default prefetch depth),
	 * ILE (interrupt little-endian) and TC (translation control).
	 */
	const u64 user_bits = LPCR_DPFD | LPCR_ILE | LPCR_TC;
	struct kvmppc_vcore *vcore = vcpu->arch.vcore;

	spin_lock(&vcore->lock);
	vcore->lpcr = (vcore->lpcr & ~user_bits) | (new_lpcr & user_bits);
	spin_unlock(&vcore->lock);
}
|
|
|
|
/*
 * Read one HV-specific register for the ONE_REG interface.
 *
 * @id:  register identifier (KVM_REG_PPC_*)
 * @val: receives the value
 *
 * Returns 0 on success, -EINVAL for an id not handled here.
 */
static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
				 union kvmppc_one_reg *val)
{
	long int idx;

	switch (id) {
	case KVM_REG_PPC_HIOR:
		/* always reads as zero */
		*val = get_reg_val(id, 0);
		break;
	case KVM_REG_PPC_DABR:
		*val = get_reg_val(id, vcpu->arch.dabr);
		break;
	case KVM_REG_PPC_DSCR:
		*val = get_reg_val(id, vcpu->arch.dscr);
		break;
	case KVM_REG_PPC_PURR:
		*val = get_reg_val(id, vcpu->arch.purr);
		break;
	case KVM_REG_PPC_SPURR:
		*val = get_reg_val(id, vcpu->arch.spurr);
		break;
	case KVM_REG_PPC_AMR:
		*val = get_reg_val(id, vcpu->arch.amr);
		break;
	case KVM_REG_PPC_UAMOR:
		*val = get_reg_val(id, vcpu->arch.uamor);
		break;
	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRA:
		/* ids in this range index the mmcr[] array */
		idx = id - KVM_REG_PPC_MMCR0;
		*val = get_reg_val(id, vcpu->arch.mmcr[idx]);
		break;
	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
		/* ids in this range index the pmc[] array */
		idx = id - KVM_REG_PPC_PMC1;
		*val = get_reg_val(id, vcpu->arch.pmc[idx]);
		break;
	case KVM_REG_PPC_SIAR:
		*val = get_reg_val(id, vcpu->arch.siar);
		break;
	case KVM_REG_PPC_SDAR:
		*val = get_reg_val(id, vcpu->arch.sdar);
		break;

	/* The registered-area values are read under vpa_update_lock */
	case KVM_REG_PPC_VPA_ADDR:
		spin_lock(&vcpu->arch.vpa_update_lock);
		*val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
		spin_unlock(&vcpu->arch.vpa_update_lock);
		break;
	case KVM_REG_PPC_VPA_SLB:
		spin_lock(&vcpu->arch.vpa_update_lock);
		val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa;
		val->vpaval.length = vcpu->arch.slb_shadow.len;
		spin_unlock(&vcpu->arch.vpa_update_lock);
		break;
	case KVM_REG_PPC_VPA_DTL:
		spin_lock(&vcpu->arch.vpa_update_lock);
		val->vpaval.addr = vcpu->arch.dtl.next_gpa;
		val->vpaval.length = vcpu->arch.dtl.len;
		spin_unlock(&vcpu->arch.vpa_update_lock);
		break;

	/* Values kept in the virtual core structure */
	case KVM_REG_PPC_TB_OFFSET:
		*val = get_reg_val(id, vcpu->arch.vcore->tb_offset);
		break;
	case KVM_REG_PPC_LPCR:
		*val = get_reg_val(id, vcpu->arch.vcore->lpcr);
		break;
	case KVM_REG_PPC_PPR:
		*val = get_reg_val(id, vcpu->arch.ppr);
		break;
	case KVM_REG_PPC_ARCH_COMPAT:
		*val = get_reg_val(id, vcpu->arch.vcore->arch_compat);
		break;

	default:
		return -EINVAL;
	}

	return 0;
}
|
|
|
|
/*
 * Write one HV-specific register for the ONE_REG interface.
 *
 * @id:  register identifier (KVM_REG_PPC_*)
 * @val: value to store
 *
 * Returns 0 on success; -EINVAL for an id not handled here, a non-zero
 * HIOR, or a registered-area request that breaks the ordering rules
 * checked below; otherwise the result of set_vpa() or
 * kvmppc_set_arch_compat().
 */
static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
				 union kvmppc_one_reg *val)
{
	unsigned long gpa, nbytes;
	long int idx;

	switch (id) {
	case KVM_REG_PPC_HIOR:
		/* Only allow this to be set to zero */
		if (set_reg_val(id, *val))
			return -EINVAL;
		break;
	case KVM_REG_PPC_DABR:
		vcpu->arch.dabr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_DSCR:
		vcpu->arch.dscr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_PURR:
		vcpu->arch.purr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_SPURR:
		vcpu->arch.spurr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_AMR:
		vcpu->arch.amr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_UAMOR:
		vcpu->arch.uamor = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRA:
		/* ids in this range index the mmcr[] array */
		idx = id - KVM_REG_PPC_MMCR0;
		vcpu->arch.mmcr[idx] = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
		/* ids in this range index the pmc[] array */
		idx = id - KVM_REG_PPC_PMC1;
		vcpu->arch.pmc[idx] = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_SIAR:
		vcpu->arch.siar = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_SDAR:
		vcpu->arch.sdar = set_reg_val(id, *val);
		break;

	case KVM_REG_PPC_VPA_ADDR:
		gpa = set_reg_val(id, *val);
		/* can't clear the VPA while an SLB shadow or DTL is set */
		if (!gpa && (vcpu->arch.slb_shadow.next_gpa ||
			     vcpu->arch.dtl.next_gpa))
			return -EINVAL;
		return set_vpa(vcpu, &vcpu->arch.vpa, gpa,
			       sizeof(struct lppaca));

	case KVM_REG_PPC_VPA_SLB:
		gpa = val->vpaval.addr;
		nbytes = val->vpaval.length;
		/* an SLB shadow needs a VPA to be set first */
		if (gpa && !vcpu->arch.vpa.next_gpa)
			return -EINVAL;
		return set_vpa(vcpu, &vcpu->arch.slb_shadow, gpa, nbytes);

	case KVM_REG_PPC_VPA_DTL:
		gpa = val->vpaval.addr;
		nbytes = val->vpaval.length;
		/* a DTL needs room for one entry and a VPA set first */
		if (gpa && (nbytes < sizeof(struct dtl_entry) ||
			    !vcpu->arch.vpa.next_gpa))
			return -EINVAL;
		/* trim to a whole number of entries */
		nbytes -= nbytes % sizeof(struct dtl_entry);
		return set_vpa(vcpu, &vcpu->arch.dtl, gpa, nbytes);

	case KVM_REG_PPC_TB_OFFSET:
		/* round up to multiple of 2^24 */
		vcpu->arch.vcore->tb_offset =
			ALIGN(set_reg_val(id, *val), 1UL << 24);
		break;
	case KVM_REG_PPC_LPCR:
		kvmppc_set_lpcr(vcpu, set_reg_val(id, *val));
		break;
	case KVM_REG_PPC_PPR:
		vcpu->arch.ppr = set_reg_val(id, *val);
		break;
	case KVM_REG_PPC_ARCH_COMPAT:
		return kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val));

	default:
		return -EINVAL;
	}

	return 0;
}
|
|
|
|
static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
|
|
unsigned int id)
|
|
{
|
|
struct kvm_vcpu *vcpu;
|
|
int err = -EINVAL;
|
|
int core;
|
|
struct kvmppc_vcore *vcore;
|
|
|
|
core = id / threads_per_core;
|
|
if (core >= KVM_MAX_VCORES)
|
|
goto out;
|
|
|
|
err = -ENOMEM;
|
|
vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
|
|
if (!vcpu)
|
|
goto out;
|
|
|
|
err = kvm_vcpu_init(vcpu, kvm, id);
|
|
if (err)
|
|
goto free_vcpu;
|
|
|
|
vcpu->arch.shared = &vcpu->arch.shregs;
|
|
vcpu->arch.mmcr[0] = MMCR0_FC;
|
|
vcpu->arch.ctrl = CTRL_RUNLATCH;
|
|
/* default to host PVR, since we can't spoof it */
|
|
kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR));
|
|
spin_lock_init(&vcpu->arch.vpa_update_lock);
|
|
spin_lock_init(&vcpu->arch.tbacct_lock);
|
|
vcpu->arch.busy_preempt = TB_NIL;
|
|
|
|
kvmppc_mmu_book3s_hv_init(vcpu);
|
|
|
|
vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
|
|
|
|
init_waitqueue_head(&vcpu->arch.cpu_run);
|
|
|
|
mutex_lock(&kvm->lock);
|
|
vcore = kvm->arch.vcores[core];
|
|
if (!vcore) {
|
|
vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
|
|
if (vcore) {
|
|
INIT_LIST_HEAD(&vcore->runnable_threads);
|
|
spin_lock_init(&vcore->lock);
|
|
init_waitqueue_head(&vcore->wq);
|
|
vcore->preempt_tb = TB_NIL;
|
|
vcore->lpcr = kvm->arch.lpcr;
|
|
vcore->first_vcpuid = core * threads_per_core;
|
|
vcore->kvm = kvm;
|
|
}
|
|
kvm->arch.vcores[core] = vcore;
|
|
kvm->arch.online_vcores++;
|
|
}
|
|
mutex_unlock(&kvm->lock);
|
|
|
|
if (!vcore)
|
|
goto free_vcpu;
|
|
|
|
spin_lock(&vcore->lock);
|
|
++vcore->num_threads;
|
|
spin_unlock(&vcore->lock);
|
|
vcpu->arch.vcore = vcore;
|
|
vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
|
|
|
|
vcpu->arch.cpu_type = KVM_CPU_3S_64;
|
|
kvmppc_sanity_check(vcpu);
|
|
|
|
return vcpu;
|
|
|
|
free_vcpu:
|
|
kmem_cache_free(kvm_vcpu_cache, vcpu);
|
|
out:
|
|
return ERR_PTR(err);
|
|
}
|
|
|
|
static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa)
|
|
{
|
|
if (vpa->pinned_addr)
|
|
kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa,
|
|
vpa->dirty);
|
|
}
|
|
|
|
static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu)
|
|
{
|
|
spin_lock(&vcpu->arch.vpa_update_lock);
|
|
unpin_vpa(vcpu->kvm, &vcpu->arch.dtl);
|
|
unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
|
|
unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
|
|
spin_unlock(&vcpu->arch.vpa_update_lock);
|
|
kvm_vcpu_uninit(vcpu);
|
|
kmem_cache_free(kvm_vcpu_cache, vcpu);
|
|
}
|
|
|
|
/*
 * Request-check hook for HV mode.  Always returns 1, meaning that we
 * want to get back into the guest; @vcpu is not examined.
 */
static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu)
{
	const int reenter_guest = 1;

	return reenter_guest;
}
|
|
|
|
static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
|
|
{
|
|
unsigned long dec_nsec, now;
|
|
|
|
now = get_tb();
|
|
if (now > vcpu->arch.dec_expires) {
|
|
/* decrementer has already gone negative */
|
|
kvmppc_core_queue_dec(vcpu);
|
|
kvmppc_core_prepare_to_enter(vcpu);
|
|
return;
|
|
}
|
|
dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC
|
|
/ tb_ticks_per_sec;
|
|
hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
|
|
HRTIMER_MODE_REL);
|
|
vcpu->arch.timer_running = 1;
|
|
}
|
|
|
|
static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
|
|
{
|
|
vcpu->arch.ceded = 0;
|
|
if (vcpu->arch.timer_running) {
|
|
hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
|
|
vcpu->arch.timer_running = 0;
|
|
}
|
|
}
|
|
|
|
/*
 * Guest entry point, defined outside this file.  It takes no arguments:
 * kvmppc_run_core() publishes the vcore and thread id through the PACA
 * (kvm_hstate) before calling it.
 */
extern void __kvmppc_vcore_entry(void);
|
|
|
|
static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
|
|
struct kvm_vcpu *vcpu)
|
|
{
|
|
u64 now;
|
|
|
|
if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
|
|
return;
|
|
spin_lock(&vcpu->arch.tbacct_lock);
|
|
now = mftb();
|
|
vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) -
|
|
vcpu->arch.stolen_logged;
|
|
vcpu->arch.busy_preempt = now;
|
|
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
|
|
spin_unlock(&vcpu->arch.tbacct_lock);
|
|
--vc->n_runnable;
|
|
list_del(&vcpu->arch.run_list);
|
|
}
|
|
|
|
static int kvmppc_grab_hwthread(int cpu)
|
|
{
|
|
struct paca_struct *tpaca;
|
|
long timeout = 1000;
|
|
|
|
tpaca = &paca[cpu];
|
|
|
|
/* Ensure the thread won't go into the kernel if it wakes */
|
|
tpaca->kvm_hstate.hwthread_req = 1;
|
|
tpaca->kvm_hstate.kvm_vcpu = NULL;
|
|
|
|
/*
|
|
* If the thread is already executing in the kernel (e.g. handling
|
|
* a stray interrupt), wait for it to get back to nap mode.
|
|
* The smp_mb() is to ensure that our setting of hwthread_req
|
|
* is visible before we look at hwthread_state, so if this
|
|
* races with the code at system_reset_pSeries and the thread
|
|
* misses our setting of hwthread_req, we are sure to see its
|
|
* setting of hwthread_state, and vice versa.
|
|
*/
|
|
smp_mb();
|
|
while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) {
|
|
if (--timeout <= 0) {
|
|
pr_err("KVM: couldn't grab cpu %d\n", cpu);
|
|
return -EBUSY;
|
|
}
|
|
udelay(1);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static void kvmppc_release_hwthread(int cpu)
|
|
{
|
|
struct paca_struct *tpaca;
|
|
|
|
tpaca = &paca[cpu];
|
|
tpaca->kvm_hstate.hwthread_req = 0;
|
|
tpaca->kvm_hstate.kvm_vcpu = NULL;
|
|
}
|
|
|
|
static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
|
|
{
|
|
int cpu;
|
|
struct paca_struct *tpaca;
|
|
struct kvmppc_vcore *vc = vcpu->arch.vcore;
|
|
|
|
if (vcpu->arch.timer_running) {
|
|
hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
|
|
vcpu->arch.timer_running = 0;
|
|
}
|
|
cpu = vc->pcpu + vcpu->arch.ptid;
|
|
tpaca = &paca[cpu];
|
|
tpaca->kvm_hstate.kvm_vcpu = vcpu;
|
|
tpaca->kvm_hstate.kvm_vcore = vc;
|
|
tpaca->kvm_hstate.ptid = vcpu->arch.ptid;
|
|
vcpu->cpu = vc->pcpu;
|
|
smp_wmb();
|
|
#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
|
|
if (cpu != smp_processor_id()) {
|
|
#ifdef CONFIG_KVM_XICS
|
|
xics_wake_cpu(cpu);
|
|
#endif
|
|
if (vcpu->arch.ptid)
|
|
++vc->n_woken;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
|
|
{
|
|
int i;
|
|
|
|
HMT_low();
|
|
i = 0;
|
|
while (vc->nap_count < vc->n_woken) {
|
|
if (++i >= 1000000) {
|
|
pr_err("kvmppc_wait_for_nap timeout %d %d\n",
|
|
vc->nap_count, vc->n_woken);
|
|
break;
|
|
}
|
|
cpu_relax();
|
|
}
|
|
HMT_medium();
|
|
}
|
|
|
|
/*
|
|
* Check that we are on thread 0 and that any other threads in
|
|
* this core are off-line. Then grab the threads so they can't
|
|
* enter the kernel.
|
|
*/
|
|
static int on_primary_thread(void)
|
|
{
|
|
int cpu = smp_processor_id();
|
|
int thr = cpu_thread_in_core(cpu);
|
|
|
|
if (thr)
|
|
return 0;
|
|
while (++thr < threads_per_core)
|
|
if (cpu_online(cpu + thr))
|
|
return 0;
|
|
|
|
/* Grab all hw threads so they can't go into the kernel */
|
|
for (thr = 1; thr < threads_per_core; ++thr) {
|
|
if (kvmppc_grab_hwthread(cpu + thr)) {
|
|
/* Couldn't grab one; let the others go */
|
|
do {
|
|
kvmppc_release_hwthread(cpu + thr);
|
|
} while (--thr > 0);
|
|
return 0;
|
|
}
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
/*
|
|
* Run a set of guest threads on a physical core.
|
|
* Called with vc->lock held.
|
|
*/
|
|
static void kvmppc_run_core(struct kvmppc_vcore *vc)
|
|
{
|
|
struct kvm_vcpu *vcpu, *vnext;
|
|
long ret;
|
|
u64 now;
|
|
int i, need_vpa_update;
|
|
int srcu_idx;
|
|
struct kvm_vcpu *vcpus_to_update[threads_per_core];
|
|
|
|
/* don't start if any threads have a signal pending */
|
|
need_vpa_update = 0;
|
|
list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
|
|
if (signal_pending(vcpu->arch.run_task))
|
|
return;
|
|
if (vcpu->arch.vpa.update_pending ||
|
|
vcpu->arch.slb_shadow.update_pending ||
|
|
vcpu->arch.dtl.update_pending)
|
|
vcpus_to_update[need_vpa_update++] = vcpu;
|
|
}
|
|
|
|
/*
|
|
* Initialize *vc, in particular vc->vcore_state, so we can
|
|
* drop the vcore lock if necessary.
|
|
*/
|
|
vc->n_woken = 0;
|
|
vc->nap_count = 0;
|
|
vc->entry_exit_count = 0;
|
|
vc->vcore_state = VCORE_STARTING;
|
|
vc->in_guest = 0;
|
|
vc->napping_threads = 0;
|
|
|
|
/*
|
|
* Updating any of the vpas requires calling kvmppc_pin_guest_page,
|
|
* which can't be called with any spinlocks held.
|
|
*/
|
|
if (need_vpa_update) {
|
|
spin_unlock(&vc->lock);
|
|
for (i = 0; i < need_vpa_update; ++i)
|
|
kvmppc_update_vpas(vcpus_to_update[i]);
|
|
spin_lock(&vc->lock);
|
|
}
|
|
|
|
/*
|
|
* Make sure we are running on thread 0, and that
|
|
* secondary threads are offline.
|
|
*/
|
|
if (threads_per_core > 1 && !on_primary_thread()) {
|
|
list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
|
|
vcpu->arch.ret = -EBUSY;
|
|
goto out;
|
|
}
|
|
|
|
vc->pcpu = smp_processor_id();
|
|
list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
|
|
kvmppc_start_thread(vcpu);
|
|
kvmppc_create_dtl_entry(vcpu, vc);
|
|
}
|
|
|
|
/* Set this explicitly in case thread 0 doesn't have a vcpu */
|
|
get_paca()->kvm_hstate.kvm_vcore = vc;
|
|
get_paca()->kvm_hstate.ptid = 0;
|
|
|
|
vc->vcore_state = VCORE_RUNNING;
|
|
preempt_disable();
|
|
spin_unlock(&vc->lock);
|
|
|
|
kvm_guest_enter();
|
|
|
|
srcu_idx = srcu_read_lock(&vc->kvm->srcu);
|
|
|
|
__kvmppc_vcore_entry();
|
|
|
|
spin_lock(&vc->lock);
|
|
/* disable sending of IPIs on virtual external irqs */
|
|
list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
|
|
vcpu->cpu = -1;
|
|
/* wait for secondary threads to finish writing their state to memory */
|
|
if (vc->nap_count < vc->n_woken)
|
|
kvmppc_wait_for_nap(vc);
|
|
for (i = 0; i < threads_per_core; ++i)
|
|
kvmppc_release_hwthread(vc->pcpu + i);
|
|
/* prevent other vcpu threads from doing kvmppc_start_thread() now */
|
|
vc->vcore_state = VCORE_EXITING;
|
|
spin_unlock(&vc->lock);
|
|
|
|
srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
|
|
|
|
/* make sure updates to secondary vcpu structs are visible now */
|
|
smp_mb();
|
|
kvm_guest_exit();
|
|
|
|
preempt_enable();
|
|
kvm_resched(vcpu);
|
|
|
|
spin_lock(&vc->lock);
|
|
now = get_tb();
|
|
list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
|
|
/* cancel pending dec exception if dec is positive */
|
|
if (now < vcpu->arch.dec_expires &&
|
|
kvmppc_core_pending_dec(vcpu))
|
|
kvmppc_core_dequeue_dec(vcpu);
|
|
|
|
ret = RESUME_GUEST;
|
|
if (vcpu->arch.trap)
|
|
ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu,
|
|
vcpu->arch.run_task);
|
|
|
|
vcpu->arch.ret = ret;
|
|
vcpu->arch.trap = 0;
|
|
|
|
if (vcpu->arch.ceded) {
|
|
if (ret != RESUME_GUEST)
|
|
kvmppc_end_cede(vcpu);
|
|
else
|
|
kvmppc_set_timer(vcpu);
|
|
}
|
|
}
|
|
|
|
out:
|
|
vc->vcore_state = VCORE_INACTIVE;
|
|
list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
|
|
arch.run_list) {
|
|
if (vcpu->arch.ret != RESUME_GUEST) {
|
|
kvmppc_remove_runnable(vc, vcpu);
|
|
wake_up(&vcpu->arch.cpu_run);
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Wait for some other vcpu thread to execute us, and
|
|
* wake us up when we need to handle something in the host.
|
|
*/
|
|
static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state)
|
|
{
|
|
DEFINE_WAIT(wait);
|
|
|
|
prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
|
|
if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
|
|
schedule();
|
|
finish_wait(&vcpu->arch.cpu_run, &wait);
|
|
}
|
|
|
|
/*
|
|
* All the vcpus in this vcore are idle, so wait for a decrementer
|
|
* or external interrupt to one of the vcpus. vc->lock is held.
|
|
*/
|
|
static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
|
|
{
|
|
DEFINE_WAIT(wait);
|
|
|
|
prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
|
|
vc->vcore_state = VCORE_SLEEPING;
|
|
spin_unlock(&vc->lock);
|
|
schedule();
|
|
finish_wait(&vc->wq, &wait);
|
|
spin_lock(&vc->lock);
|
|
vc->vcore_state = VCORE_INACTIVE;
|
|
}
|
|
|
|
static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
|
|
{
|
|
int n_ceded;
|
|
struct kvmppc_vcore *vc;
|
|
struct kvm_vcpu *v, *vn;
|
|
|
|
kvm_run->exit_reason = 0;
|
|
vcpu->arch.ret = RESUME_GUEST;
|
|
vcpu->arch.trap = 0;
|
|
kvmppc_update_vpas(vcpu);
|
|
|
|
/*
|
|
* Synchronize with other threads in this virtual core
|
|
*/
|
|
vc = vcpu->arch.vcore;
|
|
spin_lock(&vc->lock);
|
|
vcpu->arch.ceded = 0;
|
|
vcpu->arch.run_task = current;
|
|
vcpu->arch.kvm_run = kvm_run;
|
|
vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
|
|
vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
|
|
vcpu->arch.busy_preempt = TB_NIL;
|
|
list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
|
|
++vc->n_runnable;
|
|
|
|
/*
|
|
* This happens the first time this is called for a vcpu.
|
|
* If the vcore is already running, we may be able to start
|
|
* this thread straight away and have it join in.
|
|
*/
|
|
if (!signal_pending(current)) {
|
|
if (vc->vcore_state == VCORE_RUNNING &&
|
|
VCORE_EXIT_COUNT(vc) == 0) {
|
|
kvmppc_create_dtl_entry(vcpu, vc);
|
|
kvmppc_start_thread(vcpu);
|
|
} else if (vc->vcore_state == VCORE_SLEEPING) {
|
|
wake_up(&vc->wq);
|
|
}
|
|
|
|
}
|
|
|
|
while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
|
|
!signal_pending(current)) {
|
|
if (vc->vcore_state != VCORE_INACTIVE) {
|
|
spin_unlock(&vc->lock);
|
|
kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE);
|
|
spin_lock(&vc->lock);
|
|
continue;
|
|
}
|
|
list_for_each_entry_safe(v, vn, &vc->runnable_threads,
|
|
arch.run_list) {
|
|
kvmppc_core_prepare_to_enter(v);
|
|
if (signal_pending(v->arch.run_task)) {
|
|
kvmppc_remove_runnable(vc, v);
|
|
v->stat.signal_exits++;
|
|
v->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
|
|
v->arch.ret = -EINTR;
|
|
wake_up(&v->arch.cpu_run);
|
|
}
|
|
}
|
|
if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
|
|
break;
|
|
vc->runner = vcpu;
|
|
n_ceded = 0;
|
|
list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
|
|
if (!v->arch.pending_exceptions)
|
|
n_ceded += v->arch.ceded;
|
|
else
|
|
v->arch.ceded = 0;
|
|
}
|
|
if (n_ceded == vc->n_runnable)
|
|
kvmppc_vcore_blocked(vc);
|
|
else
|
|
kvmppc_run_core(vc);
|
|
vc->runner = NULL;
|
|
}
|
|
|
|
while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
|
|
(vc->vcore_state == VCORE_RUNNING ||
|
|
vc->vcore_state == VCORE_EXITING)) {
|
|
spin_unlock(&vc->lock);
|
|
kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
|
|
spin_lock(&vc->lock);
|
|
}
|
|
|
|
if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
|
|
kvmppc_remove_runnable(vc, vcpu);
|
|
vcpu->stat.signal_exits++;
|
|
kvm_run->exit_reason = KVM_EXIT_INTR;
|
|
vcpu->arch.ret = -EINTR;
|
|
}
|
|
|
|
if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
|
|
/* Wake up some vcpu to run the core */
|
|
v = list_first_entry(&vc->runnable_threads,
|
|
struct kvm_vcpu, arch.run_list);
|
|
wake_up(&v->arch.cpu_run);
|
|
}
|
|
|
|
spin_unlock(&vc->lock);
|
|
return vcpu->arch.ret;
|
|
}
|
|
|
|
static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
|
|
{
|
|
int r;
|
|
int srcu_idx;
|
|
|
|
if (!vcpu->arch.sane) {
|
|
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
|
|
return -EINVAL;
|
|
}
|
|
|
|
kvmppc_core_prepare_to_enter(vcpu);
|
|
|
|
/* No need to go into the guest when all we'll do is come back out */
|
|
if (signal_pending(current)) {
|
|
run->exit_reason = KVM_EXIT_INTR;
|
|
return -EINTR;
|
|
}
|
|
|
|
atomic_inc(&vcpu->kvm->arch.vcpus_running);
|
|
/* Order vcpus_running vs. rma_setup_done, see kvmppc_alloc_reset_hpt */
|
|
smp_mb();
|
|
|
|
/* On the first time here, set up HTAB and VRMA or RMA */
|
|
if (!vcpu->kvm->arch.rma_setup_done) {
|
|
r = kvmppc_hv_setup_htab_rma(vcpu);
|
|
if (r)
|
|
goto out;
|
|
}
|
|
|
|
flush_fp_to_thread(current);
|
|
flush_altivec_to_thread(current);
|
|
flush_vsx_to_thread(current);
|
|
vcpu->arch.wqp = &vcpu->arch.vcore->wq;
|
|
vcpu->arch.pgdir = current->mm->pgd;
|
|
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
|
|
|
|
do {
|
|
r = kvmppc_run_vcpu(run, vcpu);
|
|
|
|
if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
|
|
!(vcpu->arch.shregs.msr & MSR_PR)) {
|
|
r = kvmppc_pseries_do_hcall(vcpu);
|
|
kvmppc_core_prepare_to_enter(vcpu);
|
|
} else if (r == RESUME_PAGE_FAULT) {
|
|
srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
|
|
r = kvmppc_book3s_hv_page_fault(run, vcpu,
|
|
vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
|
|
srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
|
|
}
|
|
} while (r == RESUME_GUEST);
|
|
|
|
out:
|
|
vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
|
|
atomic_dec(&vcpu->kvm->arch.vcpus_running);
|
|
return r;
|
|
}
|
|
|
|
|
|
/* Work out RMLS (real mode limit selector) field value for a given RMA size.
|
|
Assumes POWER7 or PPC970. */
|
|
static inline int lpcr_rmls(unsigned long rma_size)
|
|
{
|
|
switch (rma_size) {
|
|
case 32ul << 20: /* 32 MB */
|
|
if (cpu_has_feature(CPU_FTR_ARCH_206))
|
|
return 8; /* only supported on POWER7 */
|
|
return -1;
|
|
case 64ul << 20: /* 64 MB */
|
|
return 3;
|
|
case 128ul << 20: /* 128 MB */
|
|
return 7;
|
|
case 256ul << 20: /* 256 MB */
|
|
return 4;
|
|
case 1ul << 30: /* 1 GB */
|
|
return 2;
|
|
case 16ul << 30: /* 16 GB */
|
|
return 1;
|
|
case 256ul << 30: /* 256 GB */
|
|
return 0;
|
|
default:
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
|
{
|
|
struct page *page;
|
|
struct kvm_rma_info *ri = vma->vm_file->private_data;
|
|
|
|
if (vmf->pgoff >= kvm_rma_pages)
|
|
return VM_FAULT_SIGBUS;
|
|
|
|
page = pfn_to_page(ri->base_pfn + vmf->pgoff);
|
|
get_page(page);
|
|
vmf->page = page;
|
|
return 0;
|
|
}
|
|
|
|
static const struct vm_operations_struct kvm_rma_vm_ops = {
|
|
.fault = kvm_rma_fault,
|
|
};
|
|
|
|
static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma)
|
|
{
|
|
vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
|
|
vma->vm_ops = &kvm_rma_vm_ops;
|
|
return 0;
|
|
}
|
|
|
|
static int kvm_rma_release(struct inode *inode, struct file *filp)
|
|
{
|
|
struct kvm_rma_info *ri = filp->private_data;
|
|
|
|
kvm_release_rma(ri);
|
|
return 0;
|
|
}
|
|
|
|
static const struct file_operations kvm_rma_fops = {
|
|
.mmap = kvm_rma_mmap,
|
|
.release = kvm_rma_release,
|
|
};
|
|
|
|
static long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
|
|
struct kvm_allocate_rma *ret)
|
|
{
|
|
long fd;
|
|
struct kvm_rma_info *ri;
|
|
/*
|
|
* Only do this on PPC970 in HV mode
|
|
*/
|
|
if (!cpu_has_feature(CPU_FTR_HVMODE) ||
|
|
!cpu_has_feature(CPU_FTR_ARCH_201))
|
|
return -EINVAL;
|
|
|
|
if (!kvm_rma_pages)
|
|
return -EINVAL;
|
|
|
|
ri = kvm_alloc_rma();
|
|
if (!ri)
|
|
return -ENOMEM;
|
|
|
|
fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR | O_CLOEXEC);
|
|
if (fd < 0)
|
|
kvm_release_rma(ri);
|
|
|
|
ret->rma_size = kvm_rma_pages << PAGE_SHIFT;
|
|
return fd;
|
|
}
|
|
|
|
static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
|
|
int linux_psize)
|
|
{
|
|
struct mmu_psize_def *def = &mmu_psize_defs[linux_psize];
|
|
|
|
if (!def->shift)
|
|
return;
|
|
(*sps)->page_shift = def->shift;
|
|
(*sps)->slb_enc = def->sllp;
|
|
(*sps)->enc[0].page_shift = def->shift;
|
|
/*
|
|
* Only return base page encoding. We don't want to return
|
|
* all the supporting pte_enc, because our H_ENTER doesn't
|
|
* support MPSS yet. Once they do, we can start passing all
|
|
* support pte_enc here
|
|
*/
|
|
(*sps)->enc[0].pte_enc = def->penc[linux_psize];
|
|
(*sps)++;
|
|
}
|
|
|
|
static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
|
|
struct kvm_ppc_smmu_info *info)
|
|
{
|
|
struct kvm_ppc_one_seg_page_size *sps;
|
|
|
|
info->flags = KVM_PPC_PAGE_SIZES_REAL;
|
|
if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
|
|
info->flags |= KVM_PPC_1T_SEGMENTS;
|
|
info->slb_size = mmu_slb_size;
|
|
|
|
/* We only support these sizes for now, and no muti-size segments */
|
|
sps = &info->sps[0];
|
|
kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K);
|
|
kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K);
|
|
kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Get (and clear) the dirty memory log for a memory slot.
|
|
*/
|
|
static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
|
|
struct kvm_dirty_log *log)
|
|
{
|
|
struct kvm_memory_slot *memslot;
|
|
int r;
|
|
unsigned long n;
|
|
|
|
mutex_lock(&kvm->slots_lock);
|
|
|
|
r = -EINVAL;
|
|
if (log->slot >= KVM_USER_MEM_SLOTS)
|
|
goto out;
|
|
|
|
memslot = id_to_memslot(kvm->memslots, log->slot);
|
|
r = -ENOENT;
|
|
if (!memslot->dirty_bitmap)
|
|
goto out;
|
|
|
|
n = kvm_dirty_bitmap_bytes(memslot);
|
|
memset(memslot->dirty_bitmap, 0, n);
|
|
|
|
r = kvmppc_hv_get_dirty_log(kvm, memslot, memslot->dirty_bitmap);
|
|
if (r)
|
|
goto out;
|
|
|
|
r = -EFAULT;
|
|
if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
|
|
goto out;
|
|
|
|
r = 0;
|
|
out:
|
|
mutex_unlock(&kvm->slots_lock);
|
|
return r;
|
|
}
|
|
|
|
static void unpin_slot(struct kvm_memory_slot *memslot)
|
|
{
|
|
unsigned long *physp;
|
|
unsigned long j, npages, pfn;
|
|
struct page *page;
|
|
|
|
physp = memslot->arch.slot_phys;
|
|
npages = memslot->npages;
|
|
if (!physp)
|
|
return;
|
|
for (j = 0; j < npages; j++) {
|
|
if (!(physp[j] & KVMPPC_GOT_PAGE))
|
|
continue;
|
|
pfn = physp[j] >> PAGE_SHIFT;
|
|
page = pfn_to_page(pfn);
|
|
SetPageDirty(page);
|
|
put_page(page);
|
|
}
|
|
}
|
|
|
|
static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free,
|
|
struct kvm_memory_slot *dont)
|
|
{
|
|
if (!dont || free->arch.rmap != dont->arch.rmap) {
|
|
vfree(free->arch.rmap);
|
|
free->arch.rmap = NULL;
|
|
}
|
|
if (!dont || free->arch.slot_phys != dont->arch.slot_phys) {
|
|
unpin_slot(free);
|
|
vfree(free->arch.slot_phys);
|
|
free->arch.slot_phys = NULL;
|
|
}
|
|
}
|
|
|
|
static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
|
|
unsigned long npages)
|
|
{
|
|
slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
|
|
if (!slot->arch.rmap)
|
|
return -ENOMEM;
|
|
slot->arch.slot_phys = NULL;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
|
|
struct kvm_memory_slot *memslot,
|
|
struct kvm_userspace_memory_region *mem)
|
|
{
|
|
unsigned long *phys;
|
|
|
|
/* Allocate a slot_phys array if needed */
|
|
phys = memslot->arch.slot_phys;
|
|
if (!kvm->arch.using_mmu_notifiers && !phys && memslot->npages) {
|
|
phys = vzalloc(memslot->npages * sizeof(unsigned long));
|
|
if (!phys)
|
|
return -ENOMEM;
|
|
memslot->arch.slot_phys = phys;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
|
|
struct kvm_userspace_memory_region *mem,
|
|
const struct kvm_memory_slot *old)
|
|
{
|
|
unsigned long npages = mem->memory_size >> PAGE_SHIFT;
|
|
struct kvm_memory_slot *memslot;
|
|
|
|
if (npages && old->npages) {
|
|
/*
|
|
* If modifying a memslot, reset all the rmap dirty bits.
|
|
* If this is a new memslot, we don't need to do anything
|
|
* since the rmap array starts out as all zeroes,
|
|
* i.e. no pages are dirty.
|
|
*/
|
|
memslot = id_to_memslot(kvm->memslots, mem->slot);
|
|
kvmppc_hv_get_dirty_log(kvm, memslot, NULL);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Update LPCR values in kvm->arch and in vcores.
|
|
* Caller must hold kvm->lock.
|
|
*/
|
|
void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask)
|
|
{
|
|
long int i;
|
|
u32 cores_done = 0;
|
|
|
|
if ((kvm->arch.lpcr & mask) == lpcr)
|
|
return;
|
|
|
|
kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) | lpcr;
|
|
|
|
for (i = 0; i < KVM_MAX_VCORES; ++i) {
|
|
struct kvmppc_vcore *vc = kvm->arch.vcores[i];
|
|
if (!vc)
|
|
continue;
|
|
spin_lock(&vc->lock);
|
|
vc->lpcr = (vc->lpcr & ~mask) | lpcr;
|
|
spin_unlock(&vc->lock);
|
|
if (++cores_done >= kvm->arch.online_vcores)
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* MMU teardown hook for HV mode; there is nothing to do. */
static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu)
{
}
|
|
|
|
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
|
|
{
|
|
int err = 0;
|
|
struct kvm *kvm = vcpu->kvm;
|
|
struct kvm_rma_info *ri = NULL;
|
|
unsigned long hva;
|
|
struct kvm_memory_slot *memslot;
|
|
struct vm_area_struct *vma;
|
|
unsigned long lpcr = 0, senc;
|
|
unsigned long lpcr_mask = 0;
|
|
unsigned long psize, porder;
|
|
unsigned long rma_size;
|
|
unsigned long rmls;
|
|
unsigned long *physp;
|
|
unsigned long i, npages;
|
|
int srcu_idx;
|
|
|
|
mutex_lock(&kvm->lock);
|
|
if (kvm->arch.rma_setup_done)
|
|
goto out; /* another vcpu beat us to it */
|
|
|
|
/* Allocate hashed page table (if not done already) and reset it */
|
|
if (!kvm->arch.hpt_virt) {
|
|
err = kvmppc_alloc_hpt(kvm, NULL);
|
|
if (err) {
|
|
pr_err("KVM: Couldn't alloc HPT\n");
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
/* Look up the memslot for guest physical address 0 */
|
|
srcu_idx = srcu_read_lock(&kvm->srcu);
|
|
memslot = gfn_to_memslot(kvm, 0);
|
|
|
|
/* We must have some memory at 0 by now */
|
|
err = -EINVAL;
|
|
if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
|
|
goto out_srcu;
|
|
|
|
/* Look up the VMA for the start of this memory slot */
|
|
hva = memslot->userspace_addr;
|
|
down_read(¤t->mm->mmap_sem);
|
|
vma = find_vma(current->mm, hva);
|
|
if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO))
|
|
goto up_out;
|
|
|
|
psize = vma_kernel_pagesize(vma);
|
|
porder = __ilog2(psize);
|
|
|
|
/* Is this one of our preallocated RMAs? */
|
|
if (vma->vm_file && vma->vm_file->f_op == &kvm_rma_fops &&
|
|
hva == vma->vm_start)
|
|
ri = vma->vm_file->private_data;
|
|
|
|
up_read(¤t->mm->mmap_sem);
|
|
|
|
if (!ri) {
|
|
/* On POWER7, use VRMA; on PPC970, give up */
|
|
err = -EPERM;
|
|
if (cpu_has_feature(CPU_FTR_ARCH_201)) {
|
|
pr_err("KVM: CPU requires an RMO\n");
|
|
goto out_srcu;
|
|
}
|
|
|
|
/* We can handle 4k, 64k or 16M pages in the VRMA */
|
|
err = -EINVAL;
|
|
if (!(psize == 0x1000 || psize == 0x10000 ||
|
|
psize == 0x1000000))
|
|
goto out_srcu;
|
|
|
|
/* Update VRMASD field in the LPCR */
|
|
senc = slb_pgsize_encoding(psize);
|
|
kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
|
|
(VRMA_VSID << SLB_VSID_SHIFT_1T);
|
|
lpcr_mask = LPCR_VRMASD;
|
|
/* the -4 is to account for senc values starting at 0x10 */
|
|
lpcr = senc << (LPCR_VRMASD_SH - 4);
|
|
|
|
/* Create HPTEs in the hash page table for the VRMA */
|
|
kvmppc_map_vrma(vcpu, memslot, porder);
|
|
|
|
} else {
|
|
/* Set up to use an RMO region */
|
|
rma_size = kvm_rma_pages;
|
|
if (rma_size > memslot->npages)
|
|
rma_size = memslot->npages;
|
|
rma_size <<= PAGE_SHIFT;
|
|
rmls = lpcr_rmls(rma_size);
|
|
err = -EINVAL;
|
|
if ((long)rmls < 0) {
|
|
pr_err("KVM: Can't use RMA of 0x%lx bytes\n", rma_size);
|
|
goto out_srcu;
|
|
}
|
|
atomic_inc(&ri->use_count);
|
|
kvm->arch.rma = ri;
|
|
|
|
/* Update LPCR and RMOR */
|
|
if (cpu_has_feature(CPU_FTR_ARCH_201)) {
|
|
/* PPC970; insert RMLS value (split field) in HID4 */
|
|
lpcr_mask = (1ul << HID4_RMLS0_SH) |
|
|
(3ul << HID4_RMLS2_SH) | HID4_RMOR;
|
|
lpcr = ((rmls >> 2) << HID4_RMLS0_SH) |
|
|
((rmls & 3) << HID4_RMLS2_SH);
|
|
/* RMOR is also in HID4 */
|
|
lpcr |= ((ri->base_pfn >> (26 - PAGE_SHIFT)) & 0xffff)
|
|
<< HID4_RMOR_SH;
|
|
} else {
|
|
/* POWER7 */
|
|
lpcr_mask = LPCR_VPM0 | LPCR_VRMA_L | LPCR_RMLS;
|
|
lpcr = rmls << LPCR_RMLS_SH;
|
|
kvm->arch.rmor = ri->base_pfn << PAGE_SHIFT;
|
|
}
|
|
pr_info("KVM: Using RMO at %lx size %lx (LPCR = %lx)\n",
|
|
ri->base_pfn << PAGE_SHIFT, rma_size, lpcr);
|
|
|
|
/* Initialize phys addrs of pages in RMO */
|
|
npages = kvm_rma_pages;
|
|
porder = __ilog2(npages);
|
|
physp = memslot->arch.slot_phys;
|
|
if (physp) {
|
|
if (npages > memslot->npages)
|
|
npages = memslot->npages;
|
|
spin_lock(&kvm->arch.slot_phys_lock);
|
|
for (i = 0; i < npages; ++i)
|
|
physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) +
|
|
porder;
|
|
spin_unlock(&kvm->arch.slot_phys_lock);
|
|
}
|
|
}
|
|
|
|
kvmppc_update_lpcr(kvm, lpcr, lpcr_mask);
|
|
|
|
/* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */
|
|
smp_wmb();
|
|
kvm->arch.rma_setup_done = 1;
|
|
err = 0;
|
|
out_srcu:
|
|
srcu_read_unlock(&kvm->srcu, srcu_idx);
|
|
out:
|
|
mutex_unlock(&kvm->lock);
|
|
return err;
|
|
|
|
up_out:
|
|
up_read(¤t->mm->mmap_sem);
|
|
goto out_srcu;
|
|
}
|
|
|
|
static int kvmppc_core_init_vm_hv(struct kvm *kvm)
|
|
{
|
|
unsigned long lpcr, lpid;
|
|
|
|
/* Allocate the guest's logical partition ID */
|
|
|
|
lpid = kvmppc_alloc_lpid();
|
|
if ((long)lpid < 0)
|
|
return -ENOMEM;
|
|
kvm->arch.lpid = lpid;
|
|
|
|
/*
|
|
* Since we don't flush the TLB when tearing down a VM,
|
|
* and this lpid might have previously been used,
|
|
* make sure we flush on each core before running the new VM.
|
|
*/
|
|
cpumask_setall(&kvm->arch.need_tlb_flush);
|
|
|
|
kvm->arch.rma = NULL;
|
|
|
|
kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
|
|
|
|
if (cpu_has_feature(CPU_FTR_ARCH_201)) {
|
|
/* PPC970; HID4 is effectively the LPCR */
|
|
kvm->arch.host_lpid = 0;
|
|
kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4);
|
|
lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH));
|
|
lpcr |= ((lpid >> 4) << HID4_LPID1_SH) |
|
|
((lpid & 0xf) << HID4_LPID5_SH);
|
|
} else {
|
|
/* POWER7; init LPCR for virtual RMA mode */
|
|
kvm->arch.host_lpid = mfspr(SPRN_LPID);
|
|
kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
|
|
lpcr &= LPCR_PECE | LPCR_LPES;
|
|
lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
|
|
LPCR_VPM0 | LPCR_VPM1;
|
|
kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
|
|
(VRMA_VSID << SLB_VSID_SHIFT_1T);
|
|
}
|
|
kvm->arch.lpcr = lpcr;
|
|
|
|
kvm->arch.using_mmu_notifiers = !!cpu_has_feature(CPU_FTR_ARCH_206);
|
|
spin_lock_init(&kvm->arch.slot_phys_lock);
|
|
|
|
/*
|
|
* Don't allow secondary CPU threads to come online
|
|
* while any KVM VMs exist.
|
|
*/
|
|
inhibit_secondary_onlining();
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void kvmppc_free_vcores(struct kvm *kvm)
|
|
{
|
|
long int i;
|
|
|
|
for (i = 0; i < KVM_MAX_VCORES; ++i)
|
|
kfree(kvm->arch.vcores[i]);
|
|
kvm->arch.online_vcores = 0;
|
|
}
|
|
|
|
static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
|
|
{
|
|
uninhibit_secondary_onlining();
|
|
|
|
kvmppc_free_vcores(kvm);
|
|
if (kvm->arch.rma) {
|
|
kvm_release_rma(kvm->arch.rma);
|
|
kvm->arch.rma = NULL;
|
|
}
|
|
|
|
kvmppc_free_hpt(kvm);
|
|
}
|
|
|
|
/* We don't need to emulate any privileged instructions or dcbz */
|
|
static int kvmppc_core_emulate_op_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
|
|
unsigned int inst, int *advance)
|
|
{
|
|
return EMULATE_FAIL;
|
|
}
|
|
|
|
static int kvmppc_core_emulate_mtspr_hv(struct kvm_vcpu *vcpu, int sprn,
|
|
ulong spr_val)
|
|
{
|
|
return EMULATE_FAIL;
|
|
}
|
|
|
|
static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn,
|
|
ulong *spr_val)
|
|
{
|
|
return EMULATE_FAIL;
|
|
}
|
|
|
|
static int kvmppc_core_check_processor_compat_hv(void)
|
|
{
|
|
if (!cpu_has_feature(CPU_FTR_HVMODE))
|
|
return -EIO;
|
|
return 0;
|
|
}
|
|
|
|
static long kvm_arch_vm_ioctl_hv(struct file *filp,
|
|
unsigned int ioctl, unsigned long arg)
|
|
{
|
|
struct kvm *kvm __maybe_unused = filp->private_data;
|
|
void __user *argp = (void __user *)arg;
|
|
long r;
|
|
|
|
switch (ioctl) {
|
|
|
|
case KVM_ALLOCATE_RMA: {
|
|
struct kvm_allocate_rma rma;
|
|
struct kvm *kvm = filp->private_data;
|
|
|
|
r = kvm_vm_ioctl_allocate_rma(kvm, &rma);
|
|
if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma)))
|
|
r = -EFAULT;
|
|
break;
|
|
}
|
|
|
|
case KVM_PPC_ALLOCATE_HTAB: {
|
|
u32 htab_order;
|
|
|
|
r = -EFAULT;
|
|
if (get_user(htab_order, (u32 __user *)argp))
|
|
break;
|
|
r = kvmppc_alloc_reset_hpt(kvm, &htab_order);
|
|
if (r)
|
|
break;
|
|
r = -EFAULT;
|
|
if (put_user(htab_order, (u32 __user *)argp))
|
|
break;
|
|
r = 0;
|
|
break;
|
|
}
|
|
|
|
case KVM_PPC_GET_HTAB_FD: {
|
|
struct kvm_get_htab_fd ghf;
|
|
|
|
r = -EFAULT;
|
|
if (copy_from_user(&ghf, argp, sizeof(ghf)))
|
|
break;
|
|
r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf);
|
|
break;
|
|
}
|
|
|
|
default:
|
|
r = -ENOTTY;
|
|
}
|
|
|
|
return r;
|
|
}
|
|
|
|
static struct kvmppc_ops kvm_ops_hv = {
|
|
.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
|
|
.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
|
|
.get_one_reg = kvmppc_get_one_reg_hv,
|
|
.set_one_reg = kvmppc_set_one_reg_hv,
|
|
.vcpu_load = kvmppc_core_vcpu_load_hv,
|
|
.vcpu_put = kvmppc_core_vcpu_put_hv,
|
|
.set_msr = kvmppc_set_msr_hv,
|
|
.vcpu_run = kvmppc_vcpu_run_hv,
|
|
.vcpu_create = kvmppc_core_vcpu_create_hv,
|
|
.vcpu_free = kvmppc_core_vcpu_free_hv,
|
|
.check_requests = kvmppc_core_check_requests_hv,
|
|
.get_dirty_log = kvm_vm_ioctl_get_dirty_log_hv,
|
|
.flush_memslot = kvmppc_core_flush_memslot_hv,
|
|
.prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
|
|
.commit_memory_region = kvmppc_core_commit_memory_region_hv,
|
|
.unmap_hva = kvm_unmap_hva_hv,
|
|
.unmap_hva_range = kvm_unmap_hva_range_hv,
|
|
.age_hva = kvm_age_hva_hv,
|
|
.test_age_hva = kvm_test_age_hva_hv,
|
|
.set_spte_hva = kvm_set_spte_hva_hv,
|
|
.mmu_destroy = kvmppc_mmu_destroy_hv,
|
|
.free_memslot = kvmppc_core_free_memslot_hv,
|
|
.create_memslot = kvmppc_core_create_memslot_hv,
|
|
.init_vm = kvmppc_core_init_vm_hv,
|
|
.destroy_vm = kvmppc_core_destroy_vm_hv,
|
|
.get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv,
|
|
.emulate_op = kvmppc_core_emulate_op_hv,
|
|
.emulate_mtspr = kvmppc_core_emulate_mtspr_hv,
|
|
.emulate_mfspr = kvmppc_core_emulate_mfspr_hv,
|
|
.fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv,
|
|
.arch_vm_ioctl = kvm_arch_vm_ioctl_hv,
|
|
};
|
|
|
|
static int kvmppc_book3s_init_hv(void)
|
|
{
|
|
int r;
|
|
/*
|
|
* FIXME!! Do we need to check on all cpus ?
|
|
*/
|
|
r = kvmppc_core_check_processor_compat_hv();
|
|
if (r < 0)
|
|
return r;
|
|
|
|
kvm_ops_hv.owner = THIS_MODULE;
|
|
kvmppc_hv_ops = &kvm_ops_hv;
|
|
|
|
r = kvmppc_mmu_hv_init();
|
|
return r;
|
|
}
|
|
|
|
static void kvmppc_book3s_exit_hv(void)
|
|
{
|
|
kvmppc_hv_ops = NULL;
|
|
}
|
|
|
|
module_init(kvmppc_book3s_init_hv);
|
|
module_exit(kvmppc_book3s_exit_hv);
|
|
MODULE_LICENSE("GPL");
|
|
MODULE_ALIAS_MISCDEV(KVM_MINOR);
|
|
MODULE_ALIAS("devname:kvm");
|