Merge branch 'kvm-older-features' into HEAD

Merge branch for features that did not make it into 5.18:

* New ioctls to get/set TSC frequency for a whole VM

* Allow userspace to opt out of hypercall patching

Nested virtualization improvements for AMD:

* Support for "nested nested" optimizations (nested vVMLOAD/VMSAVE,
  nested vGIF)

* Allow AVIC to co-exist with a nested guest running

* Fixes for LBR virtualization when a nested guest is running,
  and nested LBR virtualization support

* PAUSE filtering for nested hypervisors

Guest support:

* Decoupling of vcpu_is_preempted from PV spinlocks

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Paolo Bonzini 2022-04-08 12:43:40 -04:00
commit a4cfff3f0f
25 changed files with 2543 additions and 561 deletions


@ -982,12 +982,22 @@ memory.
	__u8 pad2[30];
  };

If certain flags are returned from the KVM_CAP_XEN_HVM check, they may
be set in the flags field of this ioctl:

The KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL flag requests KVM to generate
the contents of the hypercall page automatically; hypercalls will be
intercepted and passed to userspace through KVM_EXIT_XEN. In this
case, all of the blob size and address fields must be zero.

The KVM_XEN_HVM_CONFIG_EVTCHN_SEND flag indicates to KVM that userspace
will always use the KVM_XEN_HVM_EVTCHN_SEND ioctl to deliver event
channel interrupts rather than manipulating the guest's shared_info
structures directly. This, in turn, may allow KVM to enable features
such as intercepting the SCHEDOP_poll hypercall to accelerate PV
spinlock operation for the guest. Userspace may still use the ioctl
to deliver events if it was advertised, even if userspace does not
send this indication that it will always do so.

No other flags are currently valid in the struct kvm_xen_hvm_config.
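
Purely as an illustration (not part of the ABI text above), a VMM might
combine the interception and event channel delivery flags roughly as
follows; ``vm_fd`` and ``enable_xen_hvm()`` are hypothetical names and
error handling is minimal::

  #include <linux/kvm.h>
  #include <sys/ioctl.h>

  /* Hypothetical helper: enable Xen hypercall interception and direct
   * event channel delivery on an already-created VM file descriptor. */
  static int enable_xen_hvm(int vm_fd)
  {
          struct kvm_xen_hvm_config cfg = { 0 };
          int xen_caps = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XEN_HVM);

          if (xen_caps & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL)
                  cfg.flags |= KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL;
          if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND)
                  cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;

          /* With INTERCEPT_HCALL, the blob size and address fields stay
           * zero and hypercalls exit to userspace via KVM_EXIT_XEN. */
          return ioctl(vm_fd, KVM_XEN_HVM_CONFIG, &cfg);
  }
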
@ -1887,22 +1897,25 @@ the future.
4.55 KVM_SET_TSC_KHZ
--------------------

:Capability: KVM_CAP_TSC_CONTROL / KVM_CAP_VM_TSC_CONTROL
:Architectures: x86
:Type: vcpu ioctl / vm ioctl
:Parameters: virtual tsc_khz
:Returns: 0 on success, -1 on error

Specifies the tsc frequency for the virtual machine. The unit of the
frequency is KHz.

If the KVM_CAP_VM_TSC_CONTROL capability is advertised, this can also
be used as a vm ioctl to set the initial tsc frequency of subsequently
created vCPUs.

4.56 KVM_GET_TSC_KHZ
--------------------

:Capability: KVM_CAP_GET_TSC_KHZ / KVM_CAP_VM_TSC_CONTROL
:Architectures: x86
:Type: vcpu ioctl / vm ioctl
:Parameters: none
:Returns: virtual tsc-khz on success, negative value on error
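
A hedged sketch (fragment, not a complete program) of the new vm ioctl
variant, assuming KVM_CAP_VM_TSC_CONTROL is advertised; ``vm_fd`` and the
2.5 GHz value are illustrative, and vCPUs are created afterwards::

  /* Set the VM-wide default TSC frequency before creating vCPUs so
   * that every subsequently created vCPU inherits it. */
  if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_VM_TSC_CONTROL) > 0) {
          if (ioctl(vm_fd, KVM_SET_TSC_KHZ, 2500 * 1000) < 0)
                  perror("KVM_SET_TSC_KHZ (vm)");

          /* The vm ioctl takes no parameters and returns the current
           * virtual tsc-khz value (or a negative error). */
          printf("VM TSC frequency: %d kHz\n",
                 ioctl(vm_fd, KVM_GET_TSC_KHZ));
  }
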
@ -5216,7 +5229,25 @@ have deterministic behavior.
		struct {
			__u64 gfn;
		} shared_info;
		struct {
			__u32 send_port;
			__u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */
			__u32 flags;
			union {
				struct {
					__u32 port;
					__u32 vcpu;
					__u32 priority;
				} port;
				struct {
					__u32 port; /* Zero for eventfd */
					__s32 fd;
				} eventfd;
				__u32 padding[4];
			} deliver;
		} evtchn;
		__u32 xen_version;
		__u64 pad[8];
	} u;
  };
@ -5247,6 +5278,30 @@ KVM_XEN_ATTR_TYPE_SHARED_INFO
KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
  Sets the exception vector used to deliver Xen event channel upcalls.
  This is the HVM-wide vector injected directly by the hypervisor
  (not through the local APIC), typically configured by a guest via
  HVM_PARAM_CALLBACK_IRQ.

KVM_XEN_ATTR_TYPE_EVTCHN
  This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
  support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It configures
  an outbound port number for interception of EVTCHNOP_send requests
  from the guest. A given sending port number may be directed back
  to a specified vCPU (by APIC ID) / port / priority on the guest,
  or to trigger events on an eventfd. The vCPU and priority can be
  changed by setting KVM_XEN_EVTCHN_UPDATE in a subsequent call,
  but other fields cannot change for a given sending port. A port
  mapping is removed by using KVM_XEN_EVTCHN_DEASSIGN in the flags
  field (see the example after this list).

KVM_XEN_ATTR_TYPE_XEN_VERSION
  This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
  support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It configures
  the 32-bit version code returned to the guest when it invokes the
  XENVER_version call; typically (XEN_MAJOR << 16 | XEN_MINOR). PV
  Xen guests will often use this as a dummy hypercall to trigger
  event channel delivery, so responding within the kernel without
  exiting to userspace is beneficial.
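
To make the flow concrete, here is a hedged sketch of setting these two
attributes with KVM_XEN_HVM_SET_ATTR; the port numbers, vCPU index and
version value are invented for illustration, ``vm_fd`` is assumed to
exist, and EVTCHNSTAT_interdomain comes from the Xen public headers::

  struct kvm_xen_hvm_attr ver = {
          .type = KVM_XEN_ATTR_TYPE_XEN_VERSION,
          .u.xen_version = (4 << 16) | 17,  /* report Xen 4.17 */
  };
  struct kvm_xen_hvm_attr evt = {
          .type = KVM_XEN_ATTR_TYPE_EVTCHN,
          .u.evtchn = {
                  .send_port = 20,  /* intercept EVTCHNOP_send on port 20 */
                  .type = EVTCHNSTAT_interdomain,
                  .flags = 0,       /* KVM_XEN_EVTCHN_UPDATE/_DEASSIGN later */
                  .deliver.port = {
                          .port = 21,   /* ...and reflect it to port 21 */
                          .vcpu = 0,    /* on this vCPU (APIC ID) */
                          .priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
                  },
          },
  };

  ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &ver);
  ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &evt);
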
4.127 KVM_XEN_HVM_GET_ATTR
--------------------------
@ -5258,7 +5313,8 @@ KVM_XEN_ATTR_TYPE_UPCALL_VECTOR
:Returns: 0 on success, < 0 on error

Allows Xen VM attributes to be read. For the structure and types,
see KVM_XEN_HVM_SET_ATTR above. The KVM_XEN_ATTR_TYPE_EVTCHN
attribute cannot be read.

4.128 KVM_XEN_VCPU_SET_ATTR
---------------------------
@ -5285,6 +5341,13 @@ see KVM_XEN_HVM_SET_ATTR above.
			__u64 time_blocked;
			__u64 time_offline;
		} runstate;
		__u32 vcpu_id;
		struct {
			__u32 port;
			__u32 priority;
			__u64 expires_ns;
		} timer;
		__u8 vector;
	} u;
  };
@ -5326,6 +5389,27 @@ KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST
  or RUNSTATE_offline) to set the current accounted state as of the
  adjusted state_entry_time.

KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID
  This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
  support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the Xen
  vCPU ID of the given vCPU, to allow timer-related VCPU operations to
  be intercepted by KVM.

KVM_XEN_VCPU_ATTR_TYPE_TIMER
  This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
  support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the
  event channel port/priority for the VIRQ_TIMER of the vCPU, as well
  as allowing a pending timer to be saved/restored (see the example
  after this list).

KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR
  This attribute is available when the KVM_CAP_XEN_HVM ioctl indicates
  support for KVM_XEN_HVM_CONFIG_EVTCHN_SEND features. It sets the
  per-vCPU local APIC upcall vector, configured by a Xen guest with
  the HVMOP_set_evtchn_upcall_vector hypercall. This is typically
  used by Windows guests, and is distinct from the HVM-wide upcall
  vector configured with HVM_PARAM_CALLBACK_IRQ.
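
A hedged sketch of wiring the vCPU ID and timer attributes together with
KVM_XEN_VCPU_SET_ATTR; ``vcpu_fd``, the port number and the priority are
illustrative, and a real VMM would check each return value::

  struct kvm_xen_vcpu_attr vid = {
          .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
          .u.vcpu_id = 0,          /* Xen/ACPI vCPU ID of this vCPU */
  };
  struct kvm_xen_vcpu_attr tmr = {
          .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
          .u.timer = {
                  .port = 22,      /* event channel bound to VIRQ_TIMER */
                  .priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
                  .expires_ns = 0, /* no pending timer to restore */
          },
  };

  ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &vid);
  ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &tmr);
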
4.129 KVM_XEN_VCPU_GET_ATTR
---------------------------
@ -5645,6 +5729,25 @@ enabled with ``arch_prctl()``, but this may change in the future.
The offsets of the state save areas in struct kvm_xsave follow the contents
of CPUID leaf 0xD on the host.
4.135 KVM_XEN_HVM_EVTCHN_SEND
-----------------------------
:Capability: KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND
:Architectures: x86
:Type: vm ioctl
:Parameters: struct kvm_irq_routing_xen_evtchn
:Returns: 0 on success, < 0 on error
::
struct kvm_irq_routing_xen_evtchn {
__u32 port;
__u32 vcpu;
__u32 priority;
};
This ioctl injects an event channel interrupt directly to the guest vCPU.
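
For example, delivering an event to 2-level port 21 on vCPU 0 could look
like the following fragment (values illustrative; ``vm_fd`` assumed to be
an existing VM descriptor)::

  struct kvm_irq_routing_xen_evtchn evt = {
          .port = 21,      /* 2-level event channel port in the guest */
          .vcpu = 0,       /* destination vCPU */
          .priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
  };

  /* vm ioctl: inject the event without touching shared_info from userspace */
  if (ioctl(vm_fd, KVM_XEN_HVM_EVTCHN_SEND, &evt) < 0)
          perror("KVM_XEN_HVM_EVTCHN_SEND");
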
5. The kvm_run structure
========================
@ -7135,6 +7238,15 @@ The valid bits in cap.args[0] are:
                                    Additionally, when this quirk is disabled,
                                    KVM clears CPUID.01H:ECX[bit 3] if
                                    IA32_MISC_ENABLE[bit 18] is cleared.

KVM_X86_QUIRK_FIX_HYPERCALL_INSN    By default, KVM rewrites guest
                                    VMMCALL/VMCALL instructions to match the
                                    vendor's hypercall instruction for the
                                    system. When this quirk is disabled, KVM
                                    will no longer rewrite invalid guest
                                    hypercall instructions. Executing the
                                    incorrect hypercall instruction will
                                    generate a #UD within the guest (see the
                                    example below the table).
=================================== ============================================
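
As a hedged illustration of opting out of hypercall patching, userspace
could disable this quirk via KVM_ENABLE_CAP; KVM_CAP_DISABLE_QUIRKS2 is
assumed here to be the capability whose cap.args[0] bits the table above
documents, and ``vm_fd`` is an existing VM descriptor::

  struct kvm_enable_cap cap = {
          .cap = KVM_CAP_DISABLE_QUIRKS2,
          /* Stop KVM from rewriting VMCALL/VMMCALL to the native
           * instruction; the guest must then use the vendor's own
           * hypercall instruction or take a #UD. */
          .args[0] = KVM_X86_QUIRK_FIX_HYPERCALL_INSN,
  };

  if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
          perror("KVM_ENABLE_CAP(KVM_CAP_DISABLE_QUIRKS2)");
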
8. Other capabilities.
@ -7612,8 +7724,9 @@ PVHVM guests. Valid flags are::
  #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR	(1 << 0)
  #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL	(1 << 1)
  #define KVM_XEN_HVM_CONFIG_SHARED_INFO	(1 << 2)
  #define KVM_XEN_HVM_CONFIG_RUNSTATE		(1 << 3)
  #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL	(1 << 4)
  #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND	(1 << 5)

The KVM_XEN_HVM_CONFIG_HYPERCALL_MSR flag indicates that the KVM_XEN_HVM_CONFIG
ioctl is available, for the guest to set its hypercall page.
@ -7637,6 +7750,14 @@ The KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL flag indicates that IRQ routing entries
of the type KVM_IRQ_ROUTING_XEN_EVTCHN are supported, with the priority
field set to indicate 2 level event channel delivery.

The KVM_XEN_HVM_CONFIG_EVTCHN_SEND flag indicates that KVM supports
injecting event channel events directly into the guest with the
KVM_XEN_HVM_EVTCHN_SEND ioctl. It also indicates support for the
KVM_XEN_ATTR_TYPE_EVTCHN/XEN_VERSION HVM attributes and the
KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID/TIMER/UPCALL_VECTOR vCPU attributes
related to event channel delivery, timers, and XENVER_version
interception.
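
Relatedly, the KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL routing mentioned above
can be sketched as a single GSI routing entry of type
KVM_IRQ_ROUTING_XEN_EVTCHN; the GSI and port numbers are invented for
illustration and ``vm_fd`` is assumed to exist::

  struct kvm_irq_routing *table;
  struct kvm_irq_routing_entry *e;

  /* One routing entry: GSI 5 -> 2-level Xen event channel port 21 on vCPU 0 */
  table = calloc(1, sizeof(*table) + sizeof(*e));
  table->nr = 1;
  e = &table->entries[0];
  e->gsi = 5;
  e->type = KVM_IRQ_ROUTING_XEN_EVTCHN;
  e->u.xen_evtchn.port = 21;
  e->u.xen_evtchn.vcpu = 0;
  e->u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;

  if (ioctl(vm_fd, KVM_SET_GSI_ROUTING, table) < 0)
          perror("KVM_SET_GSI_ROUTING");
  free(table);
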
8.31 KVM_CAP_PPC_MULTITCE
-------------------------


@ -126,6 +126,7 @@ KVM_X86_OP_OPTIONAL(migrate_timers)
KVM_X86_OP(msr_filter_changed) KVM_X86_OP(msr_filter_changed)
KVM_X86_OP(complete_emulated_msr) KVM_X86_OP(complete_emulated_msr)
KVM_X86_OP(vcpu_deliver_sipi_vector) KVM_X86_OP(vcpu_deliver_sipi_vector)
KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
#undef KVM_X86_OP #undef KVM_X86_OP
#undef KVM_X86_OP_OPTIONAL #undef KVM_X86_OP_OPTIONAL


@ -607,16 +607,21 @@ struct kvm_vcpu_hv {
struct kvm_vcpu_xen {
	u64 hypercall_rip;
	u32 current_runstate;
	u8 upcall_vector;
	struct gfn_to_pfn_cache vcpu_info_cache;
	struct gfn_to_pfn_cache vcpu_time_info_cache;
	struct gfn_to_pfn_cache runstate_cache;
	u64 last_steal;
	u64 runstate_entry_time;
	u64 runstate_times[4];
	unsigned long evtchn_pending_sel;
	u32 vcpu_id; /* The Xen / ACPI vCPU ID */
	u32 timer_virq;
	u64 timer_expires; /* In guest epoch */
	atomic_t timer_pending;
	struct hrtimer timer;
	int poll_evtchn;
	struct timer_list poll_timer;
};

struct kvm_vcpu_arch {
@ -753,8 +758,7 @@ struct kvm_vcpu_arch {
gpa_t time; gpa_t time;
struct pvclock_vcpu_time_info hv_clock; struct pvclock_vcpu_time_info hv_clock;
unsigned int hw_tsc_khz; unsigned int hw_tsc_khz;
struct gfn_to_hva_cache pv_time; struct gfn_to_pfn_cache pv_time;
bool pv_time_enabled;
/* set guest stopped flag in pvclock flags field */ /* set guest stopped flag in pvclock flags field */
bool pvclock_set_guest_stopped_request; bool pvclock_set_guest_stopped_request;
@ -1024,9 +1028,12 @@ struct msr_bitmap_range {
/* Xen emulation context */
struct kvm_xen {
	u32 xen_version;
	bool long_mode;
	u8 upcall_vector;
	struct gfn_to_pfn_cache shinfo_cache;
	struct idr evtchn_ports;
	unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)];
};

enum kvm_irqchip_mode {
@ -1119,6 +1126,8 @@ struct kvm_arch {
u64 cur_tsc_generation; u64 cur_tsc_generation;
int nr_vcpus_matched_tsc; int nr_vcpus_matched_tsc;
u32 default_tsc_khz;
seqcount_raw_spinlock_t pvclock_sc; seqcount_raw_spinlock_t pvclock_sc;
bool use_master_clock; bool use_master_clock;
u64 master_kernel_ns; u64 master_kernel_ns;
@ -1498,6 +1507,11 @@ struct kvm_x86_ops {
int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err); int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);
void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector); void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
/*
* Returns vCPU specific APICv inhibit reasons
*/
unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu);
}; };
struct kvm_x86_nested_ops { struct kvm_x86_nested_ops {
@ -1799,6 +1813,7 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception); struct x86_exception *exception);
bool kvm_apicv_activated(struct kvm *kvm); bool kvm_apicv_activated(struct kvm *kvm);
bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu);
void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
enum kvm_apicv_inhibit reason, bool set); enum kvm_apicv_inhibit reason, bool set);
@ -1988,6 +2003,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
KVM_X86_QUIRK_CD_NW_CLEARED | \ KVM_X86_QUIRK_CD_NW_CLEARED | \
KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \ KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \
KVM_X86_QUIRK_OUT_7E_INC_RIP | \ KVM_X86_QUIRK_OUT_7E_INC_RIP | \
KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \
KVM_X86_QUIRK_FIX_HYPERCALL_INSN)
#endif /* _ASM_X86_KVM_HOST_H */ #endif /* _ASM_X86_KVM_HOST_H */


@ -428,11 +428,12 @@ struct kvm_sync_regs {
struct kvm_vcpu_events events; struct kvm_vcpu_events events;
}; };
#define KVM_X86_QUIRK_LINT0_REENABLED		(1 << 0)
#define KVM_X86_QUIRK_CD_NW_CLEARED		(1 << 1)
#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE		(1 << 2)
#define KVM_X86_QUIRK_OUT_7E_INC_RIP		(1 << 3)
#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT	(1 << 4)
#define KVM_X86_QUIRK_FIX_HYPERCALL_INSN	(1 << 5)

#define KVM_STATE_NESTED_FORMAT_VMX	0
#define KVM_STATE_NESTED_FORMAT_SVM	1


@ -5,7 +5,7 @@
#include <asm/ia32.h> #include <asm/ia32.h>
#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) #if defined(CONFIG_KVM_GUEST)
#include <asm/kvm_para.h> #include <asm/kvm_para.h>
#endif #endif
@ -20,7 +20,7 @@ int main(void)
BLANK(); BLANK();
#endif #endif
#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS) #if defined(CONFIG_KVM_GUEST)
OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted); OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted);
BLANK(); BLANK();
#endif #endif


@ -752,6 +752,42 @@ static void kvm_crash_shutdown(struct pt_regs *regs)
} }
#endif #endif
#if defined(CONFIG_X86_32) || !defined(CONFIG_SMP)
bool __kvm_vcpu_is_preempted(long cpu);
__visible bool __kvm_vcpu_is_preempted(long cpu)
{
struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
return !!(src->preempted & KVM_VCPU_PREEMPTED);
}
PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
#else
#include <asm/asm-offsets.h>
extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
/*
* Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
* restoring to/from the stack.
*/
asm(
".pushsection .text;"
".global __raw_callee_save___kvm_vcpu_is_preempted;"
".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
"__raw_callee_save___kvm_vcpu_is_preempted:"
ASM_ENDBR
"movq __per_cpu_offset(,%rdi,8), %rax;"
"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
"setne %al;"
ASM_RET
".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
".popsection");
#endif
static void __init kvm_guest_init(void) static void __init kvm_guest_init(void)
{ {
int i; int i;
@ -764,6 +800,9 @@ static void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
has_steal_clock = 1; has_steal_clock = 1;
static_call_update(pv_steal_clock, kvm_steal_clock); static_call_update(pv_steal_clock, kvm_steal_clock);
pv_ops.lock.vcpu_is_preempted =
PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
} }
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
@ -1005,40 +1044,6 @@ static void kvm_wait(u8 *ptr, u8 val)
} }
} }
#ifdef CONFIG_X86_32
__visible bool __kvm_vcpu_is_preempted(long cpu)
{
struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
return !!(src->preempted & KVM_VCPU_PREEMPTED);
}
PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
#else
#include <asm/asm-offsets.h>
extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
/*
* Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
* restoring to/from the stack.
*/
asm(
".pushsection .text;"
".global __raw_callee_save___kvm_vcpu_is_preempted;"
".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
"__raw_callee_save___kvm_vcpu_is_preempted:"
ASM_ENDBR
"movq __per_cpu_offset(,%rdi,8), %rax;"
"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
"setne %al;"
ASM_RET
".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
".popsection");
#endif
/* /*
* Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
*/ */
@ -1082,10 +1087,6 @@ void __init kvm_spinlock_init(void)
pv_ops.lock.wait = kvm_wait; pv_ops.lock.wait = kvm_wait;
pv_ops.lock.kick = kvm_kick_cpu; pv_ops.lock.kick = kvm_kick_cpu;
if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
pv_ops.lock.vcpu_is_preempted =
PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
}
/* /*
* When PV spinlock is enabled which is preferred over * When PV spinlock is enabled which is preferred over
* virt_spin_lock(), virt_spin_lock_key's value is meaningless. * virt_spin_lock(), virt_spin_lock_key's value is meaningless.


@ -252,7 +252,6 @@ int kvm_pic_read_irq(struct kvm *kvm)
*/ */
irq2 = 7; irq2 = 7;
intno = s->pics[1].irq_base + irq2; intno = s->pics[1].irq_base + irq2;
irq = irq2 + 8;
} else } else
intno = s->pics[0].irq_base + irq; intno = s->pics[0].irq_base + irq;
} else { } else {


@ -22,10 +22,14 @@
*/ */
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
{ {
if (lapic_in_kernel(vcpu)) int r = 0;
return apic_has_pending_timer(vcpu);
return 0; if (lapic_in_kernel(vcpu))
r = apic_has_pending_timer(vcpu);
if (kvm_xen_timer_enabled(vcpu))
r += kvm_xen_has_pending_timer(vcpu);
return r;
} }
EXPORT_SYMBOL(kvm_cpu_has_pending_timer); EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
@ -143,6 +147,8 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{ {
if (lapic_in_kernel(vcpu)) if (lapic_in_kernel(vcpu))
kvm_inject_apic_timer_irqs(vcpu); kvm_inject_apic_timer_irqs(vcpu);
if (kvm_xen_timer_enabled(vcpu))
kvm_xen_inject_timer_irqs(vcpu);
} }
EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);


@ -181,7 +181,7 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
if (!level) if (!level)
return -1; return -1;
return kvm_xen_set_evtchn_fast(e, kvm); return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
#endif #endif
default: default:
break; break;


@ -1866,17 +1866,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list) struct list_head *invalid_list)
{ {
int ret = vcpu->arch.mmu->sync_page(vcpu, sp); int ret = vcpu->arch.mmu->sync_page(vcpu, sp);
if (ret < 0) { if (ret < 0)
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return false; return ret;
}
return !!ret;
} }
static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
@ -1998,7 +1995,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu,
for_each_sp(pages, sp, parents, i) { for_each_sp(pages, sp, parents, i) {
kvm_unlink_unsync_page(vcpu->kvm, sp); kvm_unlink_unsync_page(vcpu->kvm, sp);
flush |= kvm_sync_page(vcpu, sp, &invalid_list); flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0;
mmu_pages_clear_parents(&parents); mmu_pages_clear_parents(&parents);
} }
if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) { if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
@ -2039,6 +2036,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
struct hlist_head *sp_list; struct hlist_head *sp_list;
unsigned quadrant; unsigned quadrant;
struct kvm_mmu_page *sp; struct kvm_mmu_page *sp;
int ret;
int collisions = 0; int collisions = 0;
LIST_HEAD(invalid_list); LIST_HEAD(invalid_list);
@ -2091,11 +2089,13 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
* If the sync fails, the page is zapped. If so, break * If the sync fails, the page is zapped. If so, break
* in order to rebuild it. * in order to rebuild it.
*/ */
if (!kvm_sync_page(vcpu, sp, &invalid_list)) ret = kvm_sync_page(vcpu, sp, &invalid_list);
if (ret < 0)
break; break;
WARN_ON(!list_empty(&invalid_list)); WARN_ON(!list_empty(&invalid_list));
kvm_flush_remote_tlbs(vcpu->kvm); if (ret > 0)
kvm_flush_remote_tlbs(vcpu->kvm);
} }
__clear_sp_write_flooding_count(sp); __clear_sp_write_flooding_count(sp);


@ -165,9 +165,8 @@ free_avic:
return err; return err;
} }
void avic_init_vmcb(struct vcpu_svm *svm) void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
{ {
struct vmcb *vmcb = svm->vmcb;
struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm); struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page)); phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page)); phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
@ -357,6 +356,13 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
return 1; return 1;
} }
unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu))
return APICV_INHIBIT_REASON_NESTED;
return 0;
}
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{ {
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);


@ -36,40 +36,43 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
struct x86_exception *fault) struct x86_exception *fault)
{ {
struct vcpu_svm *svm = to_svm(vcpu); struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb;
if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) { if (vmcb->control.exit_code != SVM_EXIT_NPF) {
/* /*
* TODO: track the cause of the nested page fault, and * TODO: track the cause of the nested page fault, and
* correctly fill in the high bits of exit_info_1. * correctly fill in the high bits of exit_info_1.
*/ */
svm->vmcb->control.exit_code = SVM_EXIT_NPF; vmcb->control.exit_code = SVM_EXIT_NPF;
svm->vmcb->control.exit_code_hi = 0; vmcb->control.exit_code_hi = 0;
svm->vmcb->control.exit_info_1 = (1ULL << 32); vmcb->control.exit_info_1 = (1ULL << 32);
svm->vmcb->control.exit_info_2 = fault->address; vmcb->control.exit_info_2 = fault->address;
} }
svm->vmcb->control.exit_info_1 &= ~0xffffffffULL; vmcb->control.exit_info_1 &= ~0xffffffffULL;
svm->vmcb->control.exit_info_1 |= fault->error_code; vmcb->control.exit_info_1 |= fault->error_code;
nested_svm_vmexit(svm); nested_svm_vmexit(svm);
} }
static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault) static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{ {
struct vcpu_svm *svm = to_svm(vcpu); struct vcpu_svm *svm = to_svm(vcpu);
WARN_ON(!is_guest_mode(vcpu)); struct vmcb *vmcb = svm->vmcb;
WARN_ON(!is_guest_mode(vcpu));
if (vmcb12_is_intercept(&svm->nested.ctl, if (vmcb12_is_intercept(&svm->nested.ctl,
INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) && INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) &&
!svm->nested.nested_run_pending) { !svm->nested.nested_run_pending) {
svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR; vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR;
svm->vmcb->control.exit_code_hi = 0; vmcb->control.exit_code_hi = 0;
svm->vmcb->control.exit_info_1 = fault->error_code; vmcb->control.exit_info_1 = fault->error_code;
svm->vmcb->control.exit_info_2 = fault->address; vmcb->control.exit_info_2 = fault->address;
nested_svm_vmexit(svm); nested_svm_vmexit(svm);
} else { } else {
kvm_inject_page_fault(vcpu, fault); kvm_inject_page_fault(vcpu, fault);
} }
} }
static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
@ -121,6 +124,20 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
} }
static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
{
if (!svm->v_vmload_vmsave_enabled)
return true;
if (!nested_npt_enabled(svm))
return true;
if (!(svm->nested.ctl.virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK))
return true;
return false;
}
void recalc_intercepts(struct vcpu_svm *svm) void recalc_intercepts(struct vcpu_svm *svm)
{ {
struct vmcb_control_area *c, *h; struct vmcb_control_area *c, *h;
@ -162,8 +179,17 @@ void recalc_intercepts(struct vcpu_svm *svm)
if (!intercept_smi) if (!intercept_smi)
vmcb_clr_intercept(c, INTERCEPT_SMI); vmcb_clr_intercept(c, INTERCEPT_SMI);
vmcb_set_intercept(c, INTERCEPT_VMLOAD); if (nested_vmcb_needs_vls_intercept(svm)) {
vmcb_set_intercept(c, INTERCEPT_VMSAVE); /*
* If the virtual VMLOAD/VMSAVE is not enabled for the L2,
* we must intercept these instructions to correctly
* emulate them in case L1 doesn't intercept them.
*/
vmcb_set_intercept(c, INTERCEPT_VMLOAD);
vmcb_set_intercept(c, INTERCEPT_VMSAVE);
} else {
WARN_ON(!(c->virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK));
}
} }
/* /*
@ -413,6 +439,10 @@ void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
*/ */
mask &= ~V_IRQ_MASK; mask &= ~V_IRQ_MASK;
} }
if (nested_vgif_enabled(svm))
mask |= V_GIF_MASK;
svm->nested.ctl.int_ctl &= ~mask; svm->nested.ctl.int_ctl &= ~mask;
svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask; svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask;
} }
@ -454,11 +484,6 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
vmcb12->control.exit_int_info = exit_int_info; vmcb12->control.exit_int_info = exit_int_info;
} }
static inline bool nested_npt_enabled(struct vcpu_svm *svm)
{
return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
}
static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu) static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu)
{ {
/* /*
@ -515,6 +540,8 @@ void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12) static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
{ {
bool new_vmcb12 = false; bool new_vmcb12 = false;
struct vmcb *vmcb01 = svm->vmcb01.ptr;
struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
nested_vmcb02_compute_g_pat(svm); nested_vmcb02_compute_g_pat(svm);
@ -526,18 +553,18 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
} }
if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) { if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
svm->vmcb->save.es = vmcb12->save.es; vmcb02->save.es = vmcb12->save.es;
svm->vmcb->save.cs = vmcb12->save.cs; vmcb02->save.cs = vmcb12->save.cs;
svm->vmcb->save.ss = vmcb12->save.ss; vmcb02->save.ss = vmcb12->save.ss;
svm->vmcb->save.ds = vmcb12->save.ds; vmcb02->save.ds = vmcb12->save.ds;
svm->vmcb->save.cpl = vmcb12->save.cpl; vmcb02->save.cpl = vmcb12->save.cpl;
vmcb_mark_dirty(svm->vmcb, VMCB_SEG); vmcb_mark_dirty(vmcb02, VMCB_SEG);
} }
if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) { if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
svm->vmcb->save.gdtr = vmcb12->save.gdtr; vmcb02->save.gdtr = vmcb12->save.gdtr;
svm->vmcb->save.idtr = vmcb12->save.idtr; vmcb02->save.idtr = vmcb12->save.idtr;
vmcb_mark_dirty(svm->vmcb, VMCB_DT); vmcb_mark_dirty(vmcb02, VMCB_DT);
} }
kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
@ -554,47 +581,59 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
kvm_rip_write(&svm->vcpu, vmcb12->save.rip); kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
/* In case we don't even reach vcpu_run, the fields are not updated */ /* In case we don't even reach vcpu_run, the fields are not updated */
svm->vmcb->save.rax = vmcb12->save.rax; vmcb02->save.rax = vmcb12->save.rax;
svm->vmcb->save.rsp = vmcb12->save.rsp; vmcb02->save.rsp = vmcb12->save.rsp;
svm->vmcb->save.rip = vmcb12->save.rip; vmcb02->save.rip = vmcb12->save.rip;
/* These bits will be set properly on the first execution when new_vmc12 is true */ /* These bits will be set properly on the first execution when new_vmc12 is true */
if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) { if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
svm->vmcb->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1; vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1;
svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW; svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW;
vmcb_mark_dirty(svm->vmcb, VMCB_DR); vmcb_mark_dirty(vmcb02, VMCB_DR);
}
if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
/*
* Reserved bits of DEBUGCTL are ignored. Be consistent with
* svm_set_msr's definition of reserved bits.
*/
svm_copy_lbrs(vmcb02, vmcb12);
vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS;
svm_update_lbrv(&svm->vcpu);
} else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
svm_copy_lbrs(vmcb02, vmcb01);
} }
} }
static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
{ {
const u32 int_ctl_vmcb01_bits = u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK;
V_INTR_MASKING_MASK | V_GIF_MASK | V_GIF_ENABLE_MASK; u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
const u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
struct kvm_vcpu *vcpu = &svm->vcpu; struct kvm_vcpu *vcpu = &svm->vcpu;
struct vmcb *vmcb01 = svm->vmcb01.ptr;
struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
/* /*
* Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
* exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
*/ */
/* if (svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
* Also covers avic_vapic_bar, avic_backing_page, avic_logical_id, int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
* avic_physical_id. else
*/ int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));
/* Copied from vmcb01. msrpm_base can be overwritten later. */ /* Copied from vmcb01. msrpm_base can be overwritten later. */
svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl; vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
svm->vmcb->control.iopm_base_pa = svm->vmcb01.ptr->control.iopm_base_pa; vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
svm->vmcb->control.msrpm_base_pa = svm->vmcb01.ptr->control.msrpm_base_pa; vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
/* Done at vmrun: asid. */ /* Done at vmrun: asid. */
/* Also overwritten later if necessary. */ /* Also overwritten later if necessary. */
svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; vmcb02->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
/* nested_cr3. */ /* nested_cr3. */
if (nested_npt_enabled(svm)) if (nested_npt_enabled(svm))
@ -605,21 +644,53 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
svm->nested.ctl.tsc_offset, svm->nested.ctl.tsc_offset,
svm->tsc_ratio_msr); svm->tsc_ratio_msr);
svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset; vmcb02->control.tsc_offset = vcpu->arch.tsc_offset;
if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
WARN_ON(!svm->tsc_scaling_enabled); WARN_ON(!svm->tsc_scaling_enabled);
nested_svm_update_tsc_ratio_msr(vcpu); nested_svm_update_tsc_ratio_msr(vcpu);
} }
svm->vmcb->control.int_ctl = vmcb02->control.int_ctl =
(svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) |
(svm->vmcb01.ptr->control.int_ctl & int_ctl_vmcb01_bits); (vmcb01->control.int_ctl & int_ctl_vmcb01_bits);
svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; vmcb02->control.int_vector = svm->nested.ctl.int_vector;
svm->vmcb->control.int_state = svm->nested.ctl.int_state; vmcb02->control.int_state = svm->nested.ctl.int_state;
svm->vmcb->control.event_inj = svm->nested.ctl.event_inj; vmcb02->control.event_inj = svm->nested.ctl.event_inj;
svm->vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err; vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err;
vmcb02->control.virt_ext = vmcb01->control.virt_ext &
LBR_CTL_ENABLE_MASK;
if (svm->lbrv_enabled)
vmcb02->control.virt_ext |=
(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK);
if (!nested_vmcb_needs_vls_intercept(svm))
vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
if (kvm_pause_in_guest(svm->vcpu.kvm)) {
/* use guest values since host doesn't use them */
vmcb02->control.pause_filter_count =
svm->pause_filter_enabled ?
svm->nested.ctl.pause_filter_count : 0;
vmcb02->control.pause_filter_thresh =
svm->pause_threshold_enabled ?
svm->nested.ctl.pause_filter_thresh : 0;
} else if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
/* use host values when guest doesn't use them */
vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count;
vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh;
} else {
/*
* Intercept every PAUSE otherwise and
* ignore both host and guest values
*/
vmcb02->control.pause_filter_count = 0;
vmcb02->control.pause_filter_thresh = 0;
}
nested_svm_transition_tlb_flush(vcpu); nested_svm_transition_tlb_flush(vcpu);
@ -688,6 +759,9 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
svm_set_gif(svm, true); svm_set_gif(svm, true);
if (kvm_vcpu_apicv_active(vcpu))
kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
return 0; return 0;
} }
@ -698,6 +772,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
struct vmcb *vmcb12; struct vmcb *vmcb12;
struct kvm_host_map map; struct kvm_host_map map;
u64 vmcb12_gpa; u64 vmcb12_gpa;
struct vmcb *vmcb01 = svm->vmcb01.ptr;
if (!svm->nested.hsave_msr) { if (!svm->nested.hsave_msr) {
kvm_inject_gp(vcpu, 0); kvm_inject_gp(vcpu, 0);
@ -741,14 +816,14 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
* Since vmcb01 is not in use, we can use it to store some of the L1 * Since vmcb01 is not in use, we can use it to store some of the L1
* state. * state.
*/ */
svm->vmcb01.ptr->save.efer = vcpu->arch.efer; vmcb01->save.efer = vcpu->arch.efer;
svm->vmcb01.ptr->save.cr0 = kvm_read_cr0(vcpu); vmcb01->save.cr0 = kvm_read_cr0(vcpu);
svm->vmcb01.ptr->save.cr4 = vcpu->arch.cr4; vmcb01->save.cr4 = vcpu->arch.cr4;
svm->vmcb01.ptr->save.rflags = kvm_get_rflags(vcpu); vmcb01->save.rflags = kvm_get_rflags(vcpu);
svm->vmcb01.ptr->save.rip = kvm_rip_read(vcpu); vmcb01->save.rip = kvm_rip_read(vcpu);
if (!npt_enabled) if (!npt_enabled)
svm->vmcb01.ptr->save.cr3 = kvm_read_cr3(vcpu); vmcb01->save.cr3 = kvm_read_cr3(vcpu);
svm->nested.nested_run_pending = 1; svm->nested.nested_run_pending = 1;
@ -814,8 +889,9 @@ void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
int nested_svm_vmexit(struct vcpu_svm *svm) int nested_svm_vmexit(struct vcpu_svm *svm)
{ {
struct kvm_vcpu *vcpu = &svm->vcpu; struct kvm_vcpu *vcpu = &svm->vcpu;
struct vmcb *vmcb01 = svm->vmcb01.ptr;
struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
struct vmcb *vmcb12; struct vmcb *vmcb12;
struct vmcb *vmcb = svm->vmcb;
struct kvm_host_map map; struct kvm_host_map map;
int rc; int rc;
@ -843,57 +919,68 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
/* Give the current vmcb to the guest */ /* Give the current vmcb to the guest */
vmcb12->save.es = vmcb->save.es; vmcb12->save.es = vmcb02->save.es;
vmcb12->save.cs = vmcb->save.cs; vmcb12->save.cs = vmcb02->save.cs;
vmcb12->save.ss = vmcb->save.ss; vmcb12->save.ss = vmcb02->save.ss;
vmcb12->save.ds = vmcb->save.ds; vmcb12->save.ds = vmcb02->save.ds;
vmcb12->save.gdtr = vmcb->save.gdtr; vmcb12->save.gdtr = vmcb02->save.gdtr;
vmcb12->save.idtr = vmcb->save.idtr; vmcb12->save.idtr = vmcb02->save.idtr;
vmcb12->save.efer = svm->vcpu.arch.efer; vmcb12->save.efer = svm->vcpu.arch.efer;
vmcb12->save.cr0 = kvm_read_cr0(vcpu); vmcb12->save.cr0 = kvm_read_cr0(vcpu);
vmcb12->save.cr3 = kvm_read_cr3(vcpu); vmcb12->save.cr3 = kvm_read_cr3(vcpu);
vmcb12->save.cr2 = vmcb->save.cr2; vmcb12->save.cr2 = vmcb02->save.cr2;
vmcb12->save.cr4 = svm->vcpu.arch.cr4; vmcb12->save.cr4 = svm->vcpu.arch.cr4;
vmcb12->save.rflags = kvm_get_rflags(vcpu); vmcb12->save.rflags = kvm_get_rflags(vcpu);
vmcb12->save.rip = kvm_rip_read(vcpu); vmcb12->save.rip = kvm_rip_read(vcpu);
vmcb12->save.rsp = kvm_rsp_read(vcpu); vmcb12->save.rsp = kvm_rsp_read(vcpu);
vmcb12->save.rax = kvm_rax_read(vcpu); vmcb12->save.rax = kvm_rax_read(vcpu);
vmcb12->save.dr7 = vmcb->save.dr7; vmcb12->save.dr7 = vmcb02->save.dr7;
vmcb12->save.dr6 = svm->vcpu.arch.dr6; vmcb12->save.dr6 = svm->vcpu.arch.dr6;
vmcb12->save.cpl = vmcb->save.cpl; vmcb12->save.cpl = vmcb02->save.cpl;
vmcb12->control.int_state = vmcb->control.int_state; vmcb12->control.int_state = vmcb02->control.int_state;
vmcb12->control.exit_code = vmcb->control.exit_code; vmcb12->control.exit_code = vmcb02->control.exit_code;
vmcb12->control.exit_code_hi = vmcb->control.exit_code_hi; vmcb12->control.exit_code_hi = vmcb02->control.exit_code_hi;
vmcb12->control.exit_info_1 = vmcb->control.exit_info_1; vmcb12->control.exit_info_1 = vmcb02->control.exit_info_1;
vmcb12->control.exit_info_2 = vmcb->control.exit_info_2; vmcb12->control.exit_info_2 = vmcb02->control.exit_info_2;
if (vmcb12->control.exit_code != SVM_EXIT_ERR) if (vmcb12->control.exit_code != SVM_EXIT_ERR)
nested_save_pending_event_to_vmcb12(svm, vmcb12); nested_save_pending_event_to_vmcb12(svm, vmcb12);
if (svm->nrips_enabled) if (svm->nrips_enabled)
vmcb12->control.next_rip = vmcb->control.next_rip; vmcb12->control.next_rip = vmcb02->control.next_rip;
vmcb12->control.int_ctl = svm->nested.ctl.int_ctl; vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl; vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl;
vmcb12->control.event_inj = svm->nested.ctl.event_inj; vmcb12->control.event_inj = svm->nested.ctl.event_inj;
vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err; vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;
if (!kvm_pause_in_guest(vcpu->kvm) && vmcb02->control.pause_filter_count)
vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count;
nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
svm_switch_vmcb(svm, &svm->vmcb01); svm_switch_vmcb(svm, &svm->vmcb01);
if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
svm_copy_lbrs(vmcb12, vmcb02);
svm_update_lbrv(vcpu);
} else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
svm_copy_lbrs(vmcb01, vmcb02);
svm_update_lbrv(vcpu);
}
/* /*
* On vmexit the GIF is set to false and * On vmexit the GIF is set to false and
* no event can be injected in L1. * no event can be injected in L1.
*/ */
svm_set_gif(svm, false); svm_set_gif(svm, false);
svm->vmcb->control.exit_int_info = 0; vmcb01->control.exit_int_info = 0;
svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset; svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset;
if (svm->vmcb->control.tsc_offset != svm->vcpu.arch.tsc_offset) { if (vmcb01->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset; vmcb01->control.tsc_offset = svm->vcpu.arch.tsc_offset;
vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
} }
if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
@ -907,13 +994,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
/* /*
* Restore processor state that had been saved in vmcb01 * Restore processor state that had been saved in vmcb01
*/ */
kvm_set_rflags(vcpu, svm->vmcb->save.rflags); kvm_set_rflags(vcpu, vmcb01->save.rflags);
svm_set_efer(vcpu, svm->vmcb->save.efer); svm_set_efer(vcpu, vmcb01->save.efer);
svm_set_cr0(vcpu, svm->vmcb->save.cr0 | X86_CR0_PE); svm_set_cr0(vcpu, vmcb01->save.cr0 | X86_CR0_PE);
svm_set_cr4(vcpu, svm->vmcb->save.cr4); svm_set_cr4(vcpu, vmcb01->save.cr4);
kvm_rax_write(vcpu, svm->vmcb->save.rax); kvm_rax_write(vcpu, vmcb01->save.rax);
kvm_rsp_write(vcpu, svm->vmcb->save.rsp); kvm_rsp_write(vcpu, vmcb01->save.rsp);
kvm_rip_write(vcpu, svm->vmcb->save.rip); kvm_rip_write(vcpu, vmcb01->save.rip);
svm->vcpu.arch.dr7 = DR7_FIXED_1; svm->vcpu.arch.dr7 = DR7_FIXED_1;
kvm_update_dr7(&svm->vcpu); kvm_update_dr7(&svm->vcpu);
@ -931,7 +1018,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
nested_svm_uninit_mmu_context(vcpu); nested_svm_uninit_mmu_context(vcpu);
rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false, true); rc = nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true);
if (rc) if (rc)
return 1; return 1;
@ -949,9 +1036,16 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
* right now so that it an be accounted for before we execute * right now so that it an be accounted for before we execute
* L1's next instruction. * L1's next instruction.
*/ */
if (unlikely(svm->vmcb->save.rflags & X86_EFLAGS_TF)) if (unlikely(vmcb01->save.rflags & X86_EFLAGS_TF))
kvm_queue_exception(&(svm->vcpu), DB_VECTOR); kvm_queue_exception(&(svm->vcpu), DB_VECTOR);
/*
* Un-inhibit the AVIC right away, so that other vCPUs can start
* to benefit from it right away.
*/
if (kvm_apicv_activated(vcpu->kvm))
kvm_vcpu_update_apicv(vcpu);
return 0; return 0;
} }
@ -1162,12 +1256,13 @@ static bool nested_exit_on_exception(struct vcpu_svm *svm)
static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm) static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
{ {
unsigned int nr = svm->vcpu.arch.exception.nr; unsigned int nr = svm->vcpu.arch.exception.nr;
struct vmcb *vmcb = svm->vmcb;
svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
svm->vmcb->control.exit_code_hi = 0; vmcb->control.exit_code_hi = 0;
if (svm->vcpu.arch.exception.has_error_code) if (svm->vcpu.arch.exception.has_error_code)
svm->vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code; vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code;
/* /*
* EXITINFO2 is undefined for all exception intercepts other * EXITINFO2 is undefined for all exception intercepts other
@ -1175,11 +1270,11 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
*/ */
if (nr == PF_VECTOR) { if (nr == PF_VECTOR) {
if (svm->vcpu.arch.exception.nested_apf) if (svm->vcpu.arch.exception.nested_apf)
svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
else if (svm->vcpu.arch.exception.has_payload) else if (svm->vcpu.arch.exception.has_payload)
svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload; vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
else else
svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
} else if (nr == DB_VECTOR) { } else if (nr == DB_VECTOR) {
/* See inject_pending_event. */ /* See inject_pending_event. */
kvm_deliver_exception_payload(&svm->vcpu); kvm_deliver_exception_payload(&svm->vcpu);


@ -62,8 +62,6 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#define SEG_TYPE_LDT 2 #define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3 #define SEG_TYPE_BUSY_TSS16 3
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
static bool erratum_383_found __read_mostly; static bool erratum_383_found __read_mostly;
u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
@ -172,7 +170,7 @@ static int vls = true;
module_param(vls, int, 0444); module_param(vls, int, 0444);
/* enable/disable Virtual GIF */ /* enable/disable Virtual GIF */
static int vgif = true; int vgif = true;
module_param(vgif, int, 0444); module_param(vgif, int, 0444);
/* enable/disable LBR virtualization */ /* enable/disable LBR virtualization */
@ -189,6 +187,9 @@ module_param(tsc_scaling, int, 0444);
static bool avic; static bool avic;
module_param(avic, bool, 0444); module_param(avic, bool, 0444);
static bool force_avic;
module_param_unsafe(force_avic, bool, 0444);
bool __read_mostly dump_invalid_vmcb; bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644); module_param(dump_invalid_vmcb, bool, 0644);
@ -790,6 +791,17 @@ static void init_msrpm_offsets(void)
} }
} }
void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
{
to_vmcb->save.dbgctl = from_vmcb->save.dbgctl;
to_vmcb->save.br_from = from_vmcb->save.br_from;
to_vmcb->save.br_to = from_vmcb->save.br_to;
to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from;
to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to;
vmcb_mark_dirty(to_vmcb, VMCB_LBR);
}
static void svm_enable_lbrv(struct kvm_vcpu *vcpu) static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
{ {
struct vcpu_svm *svm = to_svm(vcpu); struct vcpu_svm *svm = to_svm(vcpu);
@ -799,6 +811,10 @@ static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
/* Move the LBR msrs to the vmcb02 so that the guest can see them. */
if (is_guest_mode(vcpu))
svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
} }
static void svm_disable_lbrv(struct kvm_vcpu *vcpu) static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
@ -810,6 +826,67 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
/*
* Move the LBR msrs back to the vmcb01 to avoid copying them
* on nested guest entries.
*/
if (is_guest_mode(vcpu))
svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
}
static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index)
{
/*
* If the LBR virtualization is disabled, the LBR msrs are always
* kept in the vmcb01 to avoid copying them on nested guest entries.
*
* If nested, and the LBR virtualization is enabled/disabled, the msrs
* are moved between the vmcb01 and vmcb02 as needed.
*/
struct vmcb *vmcb =
(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ?
svm->vmcb : svm->vmcb01.ptr;
switch (index) {
case MSR_IA32_DEBUGCTLMSR:
return vmcb->save.dbgctl;
case MSR_IA32_LASTBRANCHFROMIP:
return vmcb->save.br_from;
case MSR_IA32_LASTBRANCHTOIP:
return vmcb->save.br_to;
case MSR_IA32_LASTINTFROMIP:
return vmcb->save.last_excp_from;
case MSR_IA32_LASTINTTOIP:
return vmcb->save.last_excp_to;
default:
KVM_BUG(false, svm->vcpu.kvm,
"%s: Unknown MSR 0x%x", __func__, index);
return 0;
}
}
void svm_update_lbrv(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) &
DEBUGCTLMSR_LBR;
bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
LBR_CTL_ENABLE_MASK);
if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled))
if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
enable_lbrv = true;
if (enable_lbrv == current_enable_lbrv)
return;
if (enable_lbrv)
svm_enable_lbrv(vcpu);
else
svm_disable_lbrv(vcpu);
} }
void disable_nmi_singlestep(struct vcpu_svm *svm) void disable_nmi_singlestep(struct vcpu_svm *svm)
@ -831,6 +908,9 @@ static void grow_ple_window(struct kvm_vcpu *vcpu)
struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_control_area *control = &svm->vmcb->control;
int old = control->pause_filter_count; int old = control->pause_filter_count;
if (kvm_pause_in_guest(vcpu->kvm) || !old)
return;
control->pause_filter_count = __grow_ple_window(old, control->pause_filter_count = __grow_ple_window(old,
pause_filter_count, pause_filter_count,
pause_filter_count_grow, pause_filter_count_grow,
@ -849,6 +929,9 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_control_area *control = &svm->vmcb->control;
int old = control->pause_filter_count; int old = control->pause_filter_count;
if (kvm_pause_in_guest(vcpu->kvm) || !old)
return;
control->pause_filter_count = control->pause_filter_count =
__shrink_ple_window(old, __shrink_ple_window(old,
pause_filter_count, pause_filter_count,
@ -960,6 +1043,8 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
svm->v_vmload_vmsave_enabled = false;
} else { } else {
/* /*
* If hardware supports Virtual VMLOAD VMSAVE then enable it * If hardware supports Virtual VMLOAD VMSAVE then enable it
@ -979,8 +1064,9 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
static void init_vmcb(struct kvm_vcpu *vcpu) static void init_vmcb(struct kvm_vcpu *vcpu)
{ {
struct vcpu_svm *svm = to_svm(vcpu); struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb *vmcb = svm->vmcb01.ptr;
struct vmcb_save_area *save = &svm->vmcb->save; struct vmcb_control_area *control = &vmcb->control;
struct vmcb_save_area *save = &vmcb->save;
svm_set_intercept(svm, INTERCEPT_CR0_READ); svm_set_intercept(svm, INTERCEPT_CR0_READ);
svm_set_intercept(svm, INTERCEPT_CR3_READ); svm_set_intercept(svm, INTERCEPT_CR3_READ);
@ -1104,7 +1190,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
if (kvm_vcpu_apicv_active(vcpu)) if (kvm_vcpu_apicv_active(vcpu))
avic_init_vmcb(svm); avic_init_vmcb(svm, vmcb);
if (vgif) { if (vgif) {
svm_clr_intercept(svm, INTERCEPT_STGI); svm_clr_intercept(svm, INTERCEPT_STGI);
@ -1122,10 +1208,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
} }
} }
svm_hv_init_vmcb(svm->vmcb); svm_hv_init_vmcb(vmcb);
init_vmcb_after_set_cpuid(vcpu); init_vmcb_after_set_cpuid(vcpu);
vmcb_mark_all_dirty(svm->vmcb); vmcb_mark_all_dirty(vmcb);
enable_gif(svm); enable_gif(svm);
} }
@ -1380,7 +1466,7 @@ static void svm_set_vintr(struct vcpu_svm *svm)
/* /*
* The following fields are ignored when AVIC is enabled * The following fields are ignored when AVIC is enabled
*/ */
WARN_ON(kvm_apicv_activated(svm->vcpu.kvm)); WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
svm_set_intercept(svm, INTERCEPT_VINTR); svm_set_intercept(svm, INTERCEPT_VINTR);
@ -2142,7 +2228,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value)
* Likewise, clear the VINTR intercept, we will set it * Likewise, clear the VINTR intercept, we will set it
* again while processing KVM_REQ_EVENT if needed. * again while processing KVM_REQ_EVENT if needed.
*/ */
if (vgif_enabled(svm)) if (vgif)
svm_clr_intercept(svm, INTERCEPT_STGI); svm_clr_intercept(svm, INTERCEPT_STGI);
if (svm_is_intercept(svm, INTERCEPT_VINTR)) if (svm_is_intercept(svm, INTERCEPT_VINTR))
svm_clear_vintr(svm); svm_clear_vintr(svm);
@ -2160,7 +2246,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value)
* in use, we still rely on the VINTR intercept (rather than * in use, we still rely on the VINTR intercept (rather than
* STGI) to detect an open interrupt window. * STGI) to detect an open interrupt window.
*/ */
if (!vgif_enabled(svm)) if (!vgif)
svm_clear_vintr(svm); svm_clear_vintr(svm);
} }
} }
@ -2575,25 +2661,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_TSC_AUX: case MSR_TSC_AUX:
msr_info->data = svm->tsc_aux; msr_info->data = svm->tsc_aux;
break; break;
/*
* Nobody will change the following 5 values in the VMCB so we can
* safely return them on rdmsr. They will always be 0 until LBRV is
* implemented.
*/
case MSR_IA32_DEBUGCTLMSR: case MSR_IA32_DEBUGCTLMSR:
msr_info->data = svm->vmcb->save.dbgctl;
break;
case MSR_IA32_LASTBRANCHFROMIP: case MSR_IA32_LASTBRANCHFROMIP:
msr_info->data = svm->vmcb->save.br_from;
break;
case MSR_IA32_LASTBRANCHTOIP: case MSR_IA32_LASTBRANCHTOIP:
msr_info->data = svm->vmcb->save.br_to;
break;
case MSR_IA32_LASTINTFROMIP: case MSR_IA32_LASTINTFROMIP:
msr_info->data = svm->vmcb->save.last_excp_from;
break;
case MSR_IA32_LASTINTTOIP: case MSR_IA32_LASTINTTOIP:
msr_info->data = svm->vmcb->save.last_excp_to; msr_info->data = svm_get_lbr_msr(svm, msr_info->index);
break; break;
case MSR_VM_HSAVE_PA: case MSR_VM_HSAVE_PA:
msr_info->data = svm->nested.hsave_msr; msr_info->data = svm->nested.hsave_msr;
@ -2839,12 +2912,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
if (data & DEBUGCTL_RESERVED_BITS) if (data & DEBUGCTL_RESERVED_BITS)
return 1; return 1;
svm->vmcb->save.dbgctl = data; if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)
vmcb_mark_dirty(svm->vmcb, VMCB_LBR); svm->vmcb->save.dbgctl = data;
if (data & (1ULL<<0))
svm_enable_lbrv(vcpu);
else else
svm_disable_lbrv(vcpu); svm->vmcb01.ptr->save.dbgctl = data;
svm_update_lbrv(vcpu);
break; break;
case MSR_VM_HSAVE_PA: case MSR_VM_HSAVE_PA:
/* /*
@ -2901,9 +2975,16 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu)
svm_clear_vintr(to_svm(vcpu)); svm_clear_vintr(to_svm(vcpu));
/* /*
* For AVIC, the only reason to end up here is ExtINTs. * If not running nested, for AVIC, the only reason to end up here is ExtINTs.
* In this case AVIC was temporarily disabled for * In this case AVIC was temporarily disabled for
* requesting the IRQ window and we have to re-enable it. * requesting the IRQ window and we have to re-enable it.
*
* If running nested, still remove the VM wide AVIC inhibit to
* support the case in which the interrupt window was requested when the
* vCPU was not running nested.
* All vCPUs which are still running nested will keep their AVIC
* inhibited due to the per-vCPU AVIC inhibition.
*/ */
kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
@ -2914,7 +2995,6 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu)
static int pause_interception(struct kvm_vcpu *vcpu) static int pause_interception(struct kvm_vcpu *vcpu)
{ {
bool in_kernel; bool in_kernel;
/* /*
* CPL is not made available for an SEV-ES guest, therefore * CPL is not made available for an SEV-ES guest, therefore
* vcpu->arch.preempted_in_kernel can never be true. Just * vcpu->arch.preempted_in_kernel can never be true. Just
@ -2922,8 +3002,7 @@ static int pause_interception(struct kvm_vcpu *vcpu)
*/ */
in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0; in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
if (!kvm_pause_in_guest(vcpu->kvm)) grow_ple_window(vcpu);
grow_ple_window(vcpu);
kvm_vcpu_on_spin(vcpu, in_kernel); kvm_vcpu_on_spin(vcpu, in_kernel);
return kvm_skip_emulated_instruction(vcpu); return kvm_skip_emulated_instruction(vcpu);
@ -3496,14 +3575,20 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
* enabled, the STGI interception will not occur. Enable the irq * enabled, the STGI interception will not occur. Enable the irq
* window under the assumption that the hardware will set the GIF. * window under the assumption that the hardware will set the GIF.
*/ */
if (vgif_enabled(svm) || gif_set(svm)) { if (vgif || gif_set(svm)) {
/* /*
* IRQ window is not needed when AVIC is enabled, * IRQ window is not needed when AVIC is enabled,
* unless we have pending ExtINT since it cannot be injected * unless we have pending ExtINT since it cannot be injected
* via AVIC. In such case, we need to temporarily disable AVIC, * via AVIC. In such case, KVM needs to temporarily disable AVIC,
* and fallback to injecting IRQ via V_IRQ. * and fallback to injecting IRQ via V_IRQ.
*
* If running nested, AVIC is already locally inhibited
* on this vCPU, therefore there is no need to request
* the VM wide AVIC inhibition.
*/ */
kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); if (!is_guest_mode(vcpu))
kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
svm_set_vintr(svm); svm_set_vintr(svm);
} }
} }
@ -3516,7 +3601,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
return; /* IRET will cause a vm exit */ return; /* IRET will cause a vm exit */
if (!gif_set(svm)) { if (!gif_set(svm)) {
if (vgif_enabled(svm)) if (vgif)
svm_set_intercept(svm, INTERCEPT_STGI); svm_set_intercept(svm, INTERCEPT_STGI);
return; /* STGI will cause a vm exit */ return; /* STGI will cause a vm exit */
} }
@ -3946,6 +4031,17 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
guest_cpuid_has(vcpu, X86_FEATURE_NRIPS); guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR); svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) &&
guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
svm_recalc_instruction_intercepts(vcpu, svm); svm_recalc_instruction_intercepts(vcpu, svm);
@ -3963,13 +4059,6 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
*/ */
if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC)) if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC); kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC);
/*
* Currently, AVIC does not work with nested virtualization.
* So, we disable AVIC when cpuid for SVM is set in the L1 guest.
*/
if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_NESTED);
} }
init_vmcb_after_set_cpuid(vcpu); init_vmcb_after_set_cpuid(vcpu);
} }
@ -4224,7 +4313,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
ret = nested_svm_vmexit(svm); ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
if (ret) if (ret)
return ret; return ret;
@ -4321,7 +4410,7 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu); struct vcpu_svm *svm = to_svm(vcpu);
if (!gif_set(svm)) { if (!gif_set(svm)) {
if (vgif_enabled(svm)) if (vgif)
svm_set_intercept(svm, INTERCEPT_STGI); svm_set_intercept(svm, INTERCEPT_STGI);
/* STGI will cause a vm exit */ /* STGI will cause a vm exit */
} else { } else {
@ -4632,6 +4721,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.complete_emulated_msr = svm_complete_emulated_msr, .complete_emulated_msr = svm_complete_emulated_msr,
.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
.vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
}; };
/* /*
@ -4695,6 +4785,20 @@ static __init void svm_set_cpu_caps(void)
if (tsc_scaling) if (tsc_scaling)
kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
if (vls)
kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
if (lbrv)
kvm_cpu_cap_set(X86_FEATURE_LBRV);
if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
if (vgif)
kvm_cpu_cap_set(X86_FEATURE_VGIF);
/* Nested VM can receive #VMEXIT instead of triggering #GP */ /* Nested VM can receive #VMEXIT instead of triggering #GP */
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
} }
@ -4806,15 +4910,20 @@ static __init int svm_hardware_setup(void)
nrips = false; nrips = false;
} }
enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC); enable_apicv = avic = avic && npt_enabled && (boot_cpu_has(X86_FEATURE_AVIC) || force_avic);
if (enable_apicv) { if (enable_apicv) {
pr_info("AVIC enabled\n"); if (!boot_cpu_has(X86_FEATURE_AVIC)) {
pr_warn("AVIC is not supported in CPUID but force enabled");
pr_warn("Your system might crash and burn");
} else
pr_info("AVIC enabled\n");
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
} else { } else {
svm_x86_ops.vcpu_blocking = NULL; svm_x86_ops.vcpu_blocking = NULL;
svm_x86_ops.vcpu_unblocking = NULL; svm_x86_ops.vcpu_unblocking = NULL;
svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
} }
if (vls) { if (vls) {

View File

@ -33,6 +33,7 @@
#define MSRPM_OFFSETS 16 #define MSRPM_OFFSETS 16
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled; extern bool npt_enabled;
extern int vgif;
extern bool intercept_smi; extern bool intercept_smi;
/* /*
@ -231,9 +232,14 @@ struct vcpu_svm {
unsigned int3_injected; unsigned int3_injected;
unsigned long int3_rip; unsigned long int3_rip;
/* cached guest cpuid flags for faster access */ /* optional nested SVM features that are enabled for this guest */
bool nrips_enabled : 1; bool nrips_enabled : 1;
bool tsc_scaling_enabled : 1; bool tsc_scaling_enabled : 1;
bool v_vmload_vmsave_enabled : 1;
bool lbrv_enabled : 1;
bool pause_filter_enabled : 1;
bool pause_threshold_enabled : 1;
bool vgif_enabled : 1;
u32 ldr_reg; u32 ldr_reg;
u32 dfr_reg; u32 dfr_reg;
@ -452,44 +458,70 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
return vmcb_is_intercept(&svm->vmcb->control, bit); return vmcb_is_intercept(&svm->vmcb->control, bit);
} }
static inline bool vgif_enabled(struct vcpu_svm *svm) static inline bool nested_vgif_enabled(struct vcpu_svm *svm)
{ {
return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK); return svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK);
}
static inline struct vmcb *get_vgif_vmcb(struct vcpu_svm *svm)
{
if (!vgif)
return NULL;
if (is_guest_mode(&svm->vcpu) && !nested_vgif_enabled(svm))
return svm->nested.vmcb02.ptr;
else
return svm->vmcb01.ptr;
} }
static inline void enable_gif(struct vcpu_svm *svm) static inline void enable_gif(struct vcpu_svm *svm)
{ {
if (vgif_enabled(svm)) struct vmcb *vmcb = get_vgif_vmcb(svm);
svm->vmcb->control.int_ctl |= V_GIF_MASK;
if (vmcb)
vmcb->control.int_ctl |= V_GIF_MASK;
else else
svm->vcpu.arch.hflags |= HF_GIF_MASK; svm->vcpu.arch.hflags |= HF_GIF_MASK;
} }
static inline void disable_gif(struct vcpu_svm *svm) static inline void disable_gif(struct vcpu_svm *svm)
{ {
if (vgif_enabled(svm)) struct vmcb *vmcb = get_vgif_vmcb(svm);
svm->vmcb->control.int_ctl &= ~V_GIF_MASK;
if (vmcb)
vmcb->control.int_ctl &= ~V_GIF_MASK;
else else
svm->vcpu.arch.hflags &= ~HF_GIF_MASK; svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
} }
static inline bool gif_set(struct vcpu_svm *svm) static inline bool gif_set(struct vcpu_svm *svm)
{ {
if (vgif_enabled(svm)) struct vmcb *vmcb = get_vgif_vmcb(svm);
return !!(svm->vmcb->control.int_ctl & V_GIF_MASK);
if (vmcb)
return !!(vmcb->control.int_ctl & V_GIF_MASK);
else else
return !!(svm->vcpu.arch.hflags & HF_GIF_MASK); return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
} }
static inline bool nested_npt_enabled(struct vcpu_svm *svm)
{
return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
}
/* svm.c */ /* svm.c */
#define MSR_INVALID 0xffffffffU #define MSR_INVALID 0xffffffffU
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
extern bool dump_invalid_vmcb; extern bool dump_invalid_vmcb;
u32 svm_msrpm_offset(u32 msr); u32 svm_msrpm_offset(u32 msr);
u32 *svm_vcpu_alloc_msrpm(void); u32 *svm_vcpu_alloc_msrpm(void);
void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm); void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm);
void svm_vcpu_free_msrpm(u32 *msrpm); void svm_vcpu_free_msrpm(u32 *msrpm);
void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
void svm_update_lbrv(struct kvm_vcpu *vcpu);
int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
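The GIF helpers above route the virtual GIF bit to whichever VMCB currently owns it. Restating the rule they implement, as a summary of the code above rather than new behaviour:

/*
 *  vgif module parameter off                 -> no VMCB; GIF is tracked in hflags
 *  L2 running, L1 does not use nested vGIF   -> vmcb02 (vGIF is KVM's own tool for L2)
 *  otherwise                                 -> vmcb01
 */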
@ -574,7 +606,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
int avic_ga_log_notifier(u32 ga_tag); int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm); void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm); int avic_vm_init(struct kvm *kvm);
void avic_init_vmcb(struct vcpu_svm *svm); void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);
int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu); int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu); int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
int avic_init_vcpu(struct vcpu_svm *svm); int avic_init_vcpu(struct vcpu_svm *svm);
@ -592,6 +624,7 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
void avic_vcpu_blocking(struct kvm_vcpu *vcpu); void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
void avic_ring_doorbell(struct kvm_vcpu *vcpu); void avic_ring_doorbell(struct kvm_vcpu *vcpu);
unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu);
/* sev.c */ /* sev.c */

View File

@ -4380,7 +4380,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
if (cpu_has_secondary_exec_ctrls()) if (cpu_has_secondary_exec_ctrls())
secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
if (kvm_vcpu_apicv_active(&vmx->vcpu)) { if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP0, 0);
vmcs_write64(EOI_EXIT_BITMAP1, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0);
vmcs_write64(EOI_EXIT_BITMAP2, 0); vmcs_write64(EOI_EXIT_BITMAP2, 0);

View File

@ -961,11 +961,13 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
} }
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (static_cpu_has(X86_FEATURE_PKU) && if (static_cpu_has(X86_FEATURE_PKU) &&
(kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || vcpu->arch.pkru != vcpu->arch.host_pkru &&
(vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) && ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
vcpu->arch.pkru != vcpu->arch.host_pkru) kvm_read_cr4_bits(vcpu, X86_CR4_PKE)))
write_pkru(vcpu->arch.pkru); write_pkru(vcpu->arch.pkru);
#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
} }
EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
@ -974,13 +976,15 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_state_protected) if (vcpu->arch.guest_state_protected)
return; return;
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (static_cpu_has(X86_FEATURE_PKU) && if (static_cpu_has(X86_FEATURE_PKU) &&
(kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
(vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) { kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) {
vcpu->arch.pkru = rdpkru(); vcpu->arch.pkru = rdpkru();
if (vcpu->arch.pkru != vcpu->arch.host_pkru) if (vcpu->arch.pkru != vcpu->arch.host_pkru)
write_pkru(vcpu->arch.host_pkru); write_pkru(vcpu->arch.host_pkru);
} }
#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
@ -2249,14 +2253,13 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
/* we verify if the enable bit is set... */ /* we verify if the enable bit is set... */
vcpu->arch.pv_time_enabled = false; if (system_time & 1) {
if (!(system_time & 1)) kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
return; KVM_HOST_USES_PFN, system_time & ~1ULL,
sizeof(struct pvclock_vcpu_time_info));
if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, } else {
&vcpu->arch.pv_time, system_time & ~1ULL, kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
sizeof(struct pvclock_vcpu_time_info))) }
vcpu->arch.pv_time_enabled = true;
return; return;
} }
@ -2961,63 +2964,55 @@ u64 get_kvmclock_ns(struct kvm *kvm)
return data.clock; return data.clock;
} }
static void kvm_setup_pvclock_page(struct kvm_vcpu *v, static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
struct gfn_to_hva_cache *cache, struct gfn_to_pfn_cache *gpc,
unsigned int offset) unsigned int offset)
{ {
struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_vcpu_arch *vcpu = &v->arch;
struct pvclock_vcpu_time_info guest_hv_clock; struct pvclock_vcpu_time_info *guest_hv_clock;
unsigned long flags;
if (unlikely(kvm_read_guest_offset_cached(v->kvm, cache, read_lock_irqsave(&gpc->lock, flags);
&guest_hv_clock, offset, sizeof(guest_hv_clock)))) while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
return; offset + sizeof(*guest_hv_clock))) {
read_unlock_irqrestore(&gpc->lock, flags);
/* This VCPU is paused, but it's legal for a guest to read another if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
offset + sizeof(*guest_hv_clock)))
return;
read_lock_irqsave(&gpc->lock, flags);
}
guest_hv_clock = (void *)(gpc->khva + offset);
/*
* This VCPU is paused, but it's legal for a guest to read another
* VCPU's kvmclock, so we really have to follow the specification where * VCPU's kvmclock, so we really have to follow the specification where
* it says that version is odd if data is being modified, and even after * it says that version is odd if data is being modified, and even after
* it is consistent. * it is consistent.
*
* Version field updates must be kept separate. This is because
* kvm_write_guest_cached might use a "rep movs" instruction, and
* writes within a string instruction are weakly ordered. So there
* are three writes overall.
*
* As a small optimization, only write the version field in the first
* and third write. The vcpu->pv_time cache is still valid, because the
* version field is the first in the struct.
*/ */
BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
if (guest_hv_clock.version & 1)
++guest_hv_clock.version; /* first time write, random junk */
vcpu->hv_clock.version = guest_hv_clock.version + 1;
kvm_write_guest_offset_cached(v->kvm, cache,
&vcpu->hv_clock, offset,
sizeof(vcpu->hv_clock.version));
guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1;
smp_wmb(); smp_wmb();
/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
if (vcpu->pvclock_set_guest_stopped_request) { if (vcpu->pvclock_set_guest_stopped_request) {
vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
vcpu->pvclock_set_guest_stopped_request = false; vcpu->pvclock_set_guest_stopped_request = false;
} }
trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock));
kvm_write_guest_offset_cached(v->kvm, cache,
&vcpu->hv_clock, offset,
sizeof(vcpu->hv_clock));
smp_wmb(); smp_wmb();
vcpu->hv_clock.version++; guest_hv_clock->version = ++vcpu->hv_clock.version;
kvm_write_guest_offset_cached(v->kvm, cache,
&vcpu->hv_clock, offset, mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
sizeof(vcpu->hv_clock.version)); read_unlock_irqrestore(&gpc->lock, flags);
trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
} }
static int kvm_guest_time_update(struct kvm_vcpu *v) static int kvm_guest_time_update(struct kvm_vcpu *v)
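The version dance above is the host half of the pvclock protocol: the version is made odd while the fields are being updated and even once they are consistent, with write barriers between the stores. A minimal guest-side sketch of the matching read loop, for illustration only (the helper name and barrier macros are placeholders, not code from this change):

struct pvclock_vcpu_time_info read_pvclock(volatile struct pvclock_vcpu_time_info *src)
{
	struct pvclock_vcpu_time_info dst;
	u32 version;

	do {
		version = src->version;		/* odd means the host is mid-update */
		rmb();
		memcpy(&dst, (const void *)src, sizeof(dst));
		rmb();
	} while ((version & 1) || version != src->version);

	return dst;
}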
@ -3106,13 +3101,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hv_clock.flags = pvclock_flags; vcpu->hv_clock.flags = pvclock_flags;
if (vcpu->pv_time_enabled) if (vcpu->pv_time.active)
kvm_setup_pvclock_page(v, &vcpu->pv_time, 0); kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
if (vcpu->xen.vcpu_info_set) if (vcpu->xen.vcpu_info_cache.active)
kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_info_cache, kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
offsetof(struct compat_vcpu_info, time)); offsetof(struct compat_vcpu_info, time));
if (vcpu->xen.vcpu_time_info_set) if (vcpu->xen.vcpu_time_info_cache.active)
kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0); kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0; return 0;
} }
@ -3300,7 +3295,7 @@ static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
static void kvmclock_reset(struct kvm_vcpu *vcpu) static void kvmclock_reset(struct kvm_vcpu *vcpu)
{ {
vcpu->arch.pv_time_enabled = false; kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
vcpu->arch.time = 0; vcpu->arch.time = 0;
} }
@ -4284,7 +4279,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR | r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
KVM_XEN_HVM_CONFIG_SHARED_INFO | KVM_XEN_HVM_CONFIG_SHARED_INFO |
KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL; KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
if (sched_info_on()) if (sched_info_on())
r |= KVM_XEN_HVM_CONFIG_RUNSTATE; r |= KVM_XEN_HVM_CONFIG_RUNSTATE;
break; break;
@ -4331,6 +4327,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = boot_cpu_has(X86_FEATURE_XSAVE); r = boot_cpu_has(X86_FEATURE_XSAVE);
break; break;
case KVM_CAP_TSC_CONTROL: case KVM_CAP_TSC_CONTROL:
case KVM_CAP_VM_TSC_CONTROL:
r = kvm_has_tsc_control; r = kvm_has_tsc_control;
break; break;
case KVM_CAP_X2APIC_API: case KVM_CAP_X2APIC_API:
@ -5102,7 +5099,7 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
*/ */
static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
{ {
if (!vcpu->arch.pv_time_enabled) if (!vcpu->arch.pv_time.active)
return -EINVAL; return -EINVAL;
vcpu->arch.pvclock_set_guest_stopped_request = true; vcpu->arch.pvclock_set_guest_stopped_request = true;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@ -6186,7 +6183,7 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
mutex_lock(&kvm->lock); mutex_lock(&kvm->lock);
kvm_for_each_vcpu(i, vcpu, kvm) { kvm_for_each_vcpu(i, vcpu, kvm) {
if (!vcpu->arch.pv_time_enabled) if (!vcpu->arch.pv_time.active)
continue; continue;
ret = kvm_set_guest_paused(vcpu); ret = kvm_set_guest_paused(vcpu);
@ -6513,6 +6510,15 @@ set_pit2_out:
r = kvm_xen_hvm_set_attr(kvm, &xha); r = kvm_xen_hvm_set_attr(kvm, &xha);
break; break;
} }
case KVM_XEN_HVM_EVTCHN_SEND: {
struct kvm_irq_routing_xen_evtchn uxe;
r = -EFAULT;
if (copy_from_user(&uxe, argp, sizeof(uxe)))
goto out;
r = kvm_xen_hvm_evtchn_send(kvm, &uxe);
break;
}
#endif #endif
case KVM_SET_CLOCK: case KVM_SET_CLOCK:
r = kvm_vm_ioctl_set_clock(kvm, argp); r = kvm_vm_ioctl_set_clock(kvm, argp);
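KVM_XEN_HVM_EVTCHN_SEND takes the same struct kvm_irq_routing_xen_evtchn that IRQ routing entries use. A rough userspace sketch, assuming vm_fd is an open VM file descriptor and port 15 / vCPU 0 are arbitrary example values (the selftest further down does the same thing via vm_ioctl()):

struct kvm_irq_routing_xen_evtchn evt = {
	.port     = 15,		/* guest event channel port */
	.vcpu     = 0,		/* target vCPU */
	.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
};

if (ioctl(vm_fd, KVM_XEN_HVM_EVTCHN_SEND, &evt) < 0)
	perror("KVM_XEN_HVM_EVTCHN_SEND");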
@ -6520,6 +6526,28 @@ set_pit2_out:
case KVM_GET_CLOCK: case KVM_GET_CLOCK:
r = kvm_vm_ioctl_get_clock(kvm, argp); r = kvm_vm_ioctl_get_clock(kvm, argp);
break; break;
case KVM_SET_TSC_KHZ: {
u32 user_tsc_khz;
r = -EINVAL;
user_tsc_khz = (u32)arg;
if (kvm_has_tsc_control &&
user_tsc_khz >= kvm_max_guest_tsc_khz)
goto out;
if (user_tsc_khz == 0)
user_tsc_khz = tsc_khz;
WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
r = 0;
goto out;
}
case KVM_GET_TSC_KHZ: {
r = READ_ONCE(kvm->arch.default_tsc_khz);
goto out;
}
case KVM_MEMORY_ENCRYPT_OP: { case KVM_MEMORY_ENCRYPT_OP: {
r = -ENOTTY; r = -ENOTTY;
if (!kvm_x86_ops.mem_enc_ioctl) if (!kvm_x86_ops.mem_enc_ioctl)
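The vm-level KVM_SET_TSC_KHZ / KVM_GET_TSC_KHZ handlers added above set the frequency that subsequently created vCPUs inherit, so userspace is expected to issue them before creating any vCPU. A hedged sketch, assuming kvm_fd and vm_fd are open descriptors and 2.5 GHz is just an example value:

if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_VM_TSC_CONTROL) > 0) {
	if (ioctl(vm_fd, KVM_SET_TSC_KHZ, 2500000UL) < 0)	/* 2.5 GHz */
		perror("KVM_SET_TSC_KHZ (vm)");

	/* vCPUs created from here on start with this frequency. */
	printf("VM default TSC: %d kHz\n", ioctl(vm_fd, KVM_GET_TSC_KHZ, 0));
}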
@ -8789,22 +8817,22 @@ static int kvmclock_cpu_online(unsigned int cpu)
static void kvm_timer_init(void) static void kvm_timer_init(void)
{ {
max_tsc_khz = tsc_khz;
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
#ifdef CONFIG_CPU_FREQ max_tsc_khz = tsc_khz;
struct cpufreq_policy *policy;
int cpu;
cpu = get_cpu(); if (IS_ENABLED(CONFIG_CPU_FREQ)) {
policy = cpufreq_cpu_get(cpu); struct cpufreq_policy *policy;
if (policy) { int cpu;
if (policy->cpuinfo.max_freq)
max_tsc_khz = policy->cpuinfo.max_freq; cpu = get_cpu();
cpufreq_cpu_put(policy); policy = cpufreq_cpu_get(cpu);
if (policy) {
if (policy->cpuinfo.max_freq)
max_tsc_khz = policy->cpuinfo.max_freq;
cpufreq_cpu_put(policy);
}
put_cpu();
} }
put_cpu();
#endif
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER); CPUFREQ_TRANSITION_NOTIFIER);
} }
@ -9089,6 +9117,14 @@ bool kvm_apicv_activated(struct kvm *kvm)
} }
EXPORT_SYMBOL_GPL(kvm_apicv_activated); EXPORT_SYMBOL_GPL(kvm_apicv_activated);
bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
{
ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu);
return (vm_reasons | vcpu_reasons) == 0;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated);
static void set_or_clear_apicv_inhibit(unsigned long *inhibits, static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
enum kvm_apicv_inhibit reason, bool set) enum kvm_apicv_inhibit reason, bool set)
@ -9266,6 +9302,17 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
char instruction[3]; char instruction[3];
unsigned long rip = kvm_rip_read(vcpu); unsigned long rip = kvm_rip_read(vcpu);
/*
* If the quirk is disabled, synthesize a #UD and let the guest pick up
* the pieces.
*/
if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) {
ctxt->exception.error_code_valid = false;
ctxt->exception.vector = UD_VECTOR;
ctxt->have_exception = true;
return X86EMUL_PROPAGATE_FAULT;
}
static_call(kvm_x86_patch_hypercall)(vcpu, instruction); static_call(kvm_x86_patch_hypercall)(vcpu, instruction);
return emulator_write_emulated(ctxt, rip, instruction, 3, return emulator_write_emulated(ctxt, rip, instruction, 3,
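Opting out of hypercall patching uses the existing KVM_ENABLE_CAP path with KVM_CAP_DISABLE_QUIRKS2; once the quirk is cleared, a guest that uses the other vendor's hypercall instruction gets the #UD synthesized above instead of having the instruction rewritten. A userspace sketch of the same pattern the fix_hypercall_test selftest below uses, with vm_fd assumed:

struct kvm_enable_cap cap = {
	.cap     = KVM_CAP_DISABLE_QUIRKS2,
	.args[0] = KVM_X86_QUIRK_FIX_HYPERCALL_INSN,
};

if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
	perror("KVM_ENABLE_CAP(KVM_CAP_DISABLE_QUIRKS2)");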
@ -9763,7 +9810,8 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
down_read(&vcpu->kvm->arch.apicv_update_lock); down_read(&vcpu->kvm->arch.apicv_update_lock);
activate = kvm_apicv_activated(vcpu->kvm); activate = kvm_vcpu_apicv_activated(vcpu);
if (vcpu->arch.apicv_active == activate) if (vcpu->arch.apicv_active == activate)
goto out; goto out;
@ -10164,7 +10212,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* per-VM state, and responsing vCPUs must wait for the update * per-VM state, and responsing vCPUs must wait for the update
* to complete before servicing KVM_REQ_APICV_UPDATE. * to complete before servicing KVM_REQ_APICV_UPDATE.
*/ */
WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu)); WARN_ON_ONCE(kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu));
exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu); exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
@ -10362,6 +10410,9 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
break; break;
kvm_clear_request(KVM_REQ_UNBLOCK, vcpu); kvm_clear_request(KVM_REQ_UNBLOCK, vcpu);
if (kvm_xen_has_pending_events(vcpu))
kvm_xen_inject_pending_events(vcpu);
if (kvm_cpu_has_pending_timer(vcpu)) if (kvm_cpu_has_pending_timer(vcpu))
kvm_inject_pending_timer_irqs(vcpu); kvm_inject_pending_timer_irqs(vcpu);
@ -11247,9 +11298,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_xen_init_vcpu(vcpu);
kvm_vcpu_mtrr_init(vcpu); kvm_vcpu_mtrr_init(vcpu);
vcpu_load(vcpu); vcpu_load(vcpu);
kvm_set_tsc_khz(vcpu, max_tsc_khz); kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
kvm_vcpu_reset(vcpu, false); kvm_vcpu_reset(vcpu, false);
kvm_init_mmu(vcpu); kvm_init_mmu(vcpu);
vcpu_put(vcpu); vcpu_put(vcpu);
@ -11304,6 +11356,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
fpu_free_guest_fpstate(&vcpu->arch.guest_fpu); fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
kvm_xen_destroy_vcpu(vcpu);
kvm_hv_vcpu_uninit(vcpu); kvm_hv_vcpu_uninit(vcpu);
kvm_pmu_destroy(vcpu); kvm_pmu_destroy(vcpu);
kfree(vcpu->arch.mce_banks); kfree(vcpu->arch.mce_banks);
@ -11696,6 +11749,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
pvclock_update_vm_gtod_copy(kvm); pvclock_update_vm_gtod_copy(kvm);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
kvm->arch.guest_can_read_msr_platform_info = true; kvm->arch.guest_can_read_msr_platform_info = true;
kvm->arch.enable_pmu = enable_pmu; kvm->arch.enable_pmu = enable_pmu;
@ -12173,6 +12227,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
return true; return true;
if (kvm_xen_has_pending_events(vcpu))
return true;
return false; return false;
} }

File diff suppressed because it is too large

View File

@ -15,16 +15,19 @@
extern struct static_key_false_deferred kvm_xen_enabled; extern struct static_key_false_deferred kvm_xen_enabled;
int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu); int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu);
void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu);
int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data); int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data); int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *evt);
int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data); int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data);
int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc); int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
void kvm_xen_init_vm(struct kvm *kvm); void kvm_xen_init_vm(struct kvm *kvm);
void kvm_xen_destroy_vm(struct kvm *kvm); void kvm_xen_destroy_vm(struct kvm *kvm);
void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu);
int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e, void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu);
int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe,
struct kvm *kvm); struct kvm *kvm);
int kvm_xen_setup_evtchn(struct kvm *kvm, int kvm_xen_setup_evtchn(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e, struct kvm_kernel_irq_routing_entry *e,
@ -46,11 +49,33 @@ static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm)
static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu) static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
{ {
if (static_branch_unlikely(&kvm_xen_enabled.key) && if (static_branch_unlikely(&kvm_xen_enabled.key) &&
vcpu->arch.xen.vcpu_info_set && vcpu->kvm->arch.xen.upcall_vector) vcpu->arch.xen.vcpu_info_cache.active &&
vcpu->kvm->arch.xen.upcall_vector)
return __kvm_xen_has_interrupt(vcpu); return __kvm_xen_has_interrupt(vcpu);
return 0; return 0;
} }
static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu)
{
return static_branch_unlikely(&kvm_xen_enabled.key) &&
vcpu->arch.xen.evtchn_pending_sel;
}
static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu)
{
return !!vcpu->arch.xen.timer_virq;
}
static inline int kvm_xen_has_pending_timer(struct kvm_vcpu *vcpu)
{
if (kvm_xen_hypercall_enabled(vcpu->kvm) && kvm_xen_timer_enabled(vcpu))
return atomic_read(&vcpu->arch.xen.timer_pending);
return 0;
}
void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu);
#else #else
static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
{ {
@ -65,6 +90,14 @@ static inline void kvm_xen_destroy_vm(struct kvm *kvm)
{ {
} }
static inline void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
{
}
static inline void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
{
}
static inline bool kvm_xen_msr_enabled(struct kvm *kvm) static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
{ {
return false; return false;
@ -79,6 +112,29 @@ static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
{ {
return 0; return 0;
} }
static inline void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu)
{
}
static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu)
{
return false;
}
static inline int kvm_xen_has_pending_timer(struct kvm_vcpu *vcpu)
{
return 0;
}
static inline void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
{
}
static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu)
{
return false;
}
#endif #endif
int kvm_xen_hypercall(struct kvm_vcpu *vcpu); int kvm_xen_hypercall(struct kvm_vcpu *vcpu);

View File

@ -611,7 +611,8 @@ struct kvm_hv_sint {
struct kvm_xen_evtchn { struct kvm_xen_evtchn {
u32 port; u32 port;
u32 vcpu; u32 vcpu_id;
int vcpu_idx;
u32 priority; u32 priority;
}; };

View File

@ -1144,6 +1144,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_S390_MEM_OP_EXTENSION 211 #define KVM_CAP_S390_MEM_OP_EXTENSION 211
#define KVM_CAP_PMU_CAPABILITY 212 #define KVM_CAP_PMU_CAPABILITY 212
#define KVM_CAP_DISABLE_QUIRKS2 213 #define KVM_CAP_DISABLE_QUIRKS2 213
#define KVM_CAP_VM_TSC_CONTROL 214
#ifdef KVM_CAP_IRQ_ROUTING #ifdef KVM_CAP_IRQ_ROUTING
@ -1232,6 +1233,7 @@ struct kvm_x86_mce {
#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2)
#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3)
#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4)
#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5)
struct kvm_xen_hvm_config { struct kvm_xen_hvm_config {
__u32 flags; __u32 flags;
@ -1470,7 +1472,8 @@ struct kvm_s390_ucas_mapping {
#define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2)
/* Available with KVM_CAP_PPC_GET_PVINFO */ /* Available with KVM_CAP_PPC_GET_PVINFO */
#define KVM_PPC_GET_PVINFO _IOW(KVMIO, 0xa1, struct kvm_ppc_pvinfo) #define KVM_PPC_GET_PVINFO _IOW(KVMIO, 0xa1, struct kvm_ppc_pvinfo)
/* Available with KVM_CAP_TSC_CONTROL */ /* Available with KVM_CAP_TSC_CONTROL for a vCPU, or with
* KVM_CAP_VM_TSC_CONTROL to set defaults for a VM */
#define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2) #define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2)
#define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3) #define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3)
/* Available with KVM_CAP_PCI_2_3 */ /* Available with KVM_CAP_PCI_2_3 */
@ -1686,6 +1689,32 @@ struct kvm_xen_hvm_attr {
struct { struct {
__u64 gfn; __u64 gfn;
} shared_info; } shared_info;
struct {
__u32 send_port;
__u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */
__u32 flags;
#define KVM_XEN_EVTCHN_DEASSIGN (1 << 0)
#define KVM_XEN_EVTCHN_UPDATE (1 << 1)
#define KVM_XEN_EVTCHN_RESET (1 << 2)
/*
* Events sent by the guest are either looped back to
* the guest itself (potentially on a different port#)
* or signalled via an eventfd.
*/
union {
struct {
__u32 port;
__u32 vcpu;
__u32 priority;
} port;
struct {
__u32 port; /* Zero for eventfd */
__s32 fd;
} eventfd;
__u32 padding[4];
} deliver;
} evtchn;
__u32 xen_version;
__u64 pad[8]; __u64 pad[8];
} u; } u;
}; };
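Filled in, the evtchn attribute describes one outbound port and where its events are delivered. A sketch of the loopback case exercised by the selftest below: a guest EVTCHNOP_send on port 127 is bounced back to port 15 of vCPU 0 (vm_fd is assumed; EVTCHNSTAT_interdomain is a Xen ABI constant, value 2):

struct kvm_xen_hvm_attr ev = {
	.type                           = KVM_XEN_ATTR_TYPE_EVTCHN,
	.u.evtchn.send_port             = 127,
	.u.evtchn.type                  = EVTCHNSTAT_interdomain,
	.u.evtchn.deliver.port.port     = 15,
	.u.evtchn.deliver.port.vcpu     = 0,
	.u.evtchn.deliver.port.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
};

ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &ev);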
@ -1694,11 +1723,17 @@ struct kvm_xen_hvm_attr {
#define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0 #define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0
#define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1 #define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1
#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2 #define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2
/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
#define KVM_XEN_ATTR_TYPE_EVTCHN 0x3
#define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4
/* Per-vCPU Xen attributes */ /* Per-vCPU Xen attributes */
#define KVM_XEN_VCPU_GET_ATTR _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr) #define KVM_XEN_VCPU_GET_ATTR _IOWR(KVMIO, 0xca, struct kvm_xen_vcpu_attr)
#define KVM_XEN_VCPU_SET_ATTR _IOW(KVMIO, 0xcb, struct kvm_xen_vcpu_attr) #define KVM_XEN_VCPU_SET_ATTR _IOW(KVMIO, 0xcb, struct kvm_xen_vcpu_attr)
/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
#define KVM_XEN_HVM_EVTCHN_SEND _IOW(KVMIO, 0xd0, struct kvm_irq_routing_xen_evtchn)
#define KVM_GET_SREGS2 _IOR(KVMIO, 0xcc, struct kvm_sregs2) #define KVM_GET_SREGS2 _IOR(KVMIO, 0xcc, struct kvm_sregs2)
#define KVM_SET_SREGS2 _IOW(KVMIO, 0xcd, struct kvm_sregs2) #define KVM_SET_SREGS2 _IOW(KVMIO, 0xcd, struct kvm_sregs2)
@ -1716,6 +1751,13 @@ struct kvm_xen_vcpu_attr {
__u64 time_blocked; __u64 time_blocked;
__u64 time_offline; __u64 time_offline;
} runstate; } runstate;
__u32 vcpu_id;
struct {
__u32 port;
__u32 priority;
__u64 expires_ns;
} timer;
__u8 vector;
} u; } u;
}; };
@ -1726,6 +1768,10 @@ struct kvm_xen_vcpu_attr {
#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3 #define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3
#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4 #define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4
#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5 #define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5
/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6
#define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7
#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8
/* Secure Encrypted Virtualization command */ /* Secure Encrypted Virtualization command */
enum sev_cmd_id { enum sev_cmd_id {
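The TIMER attribute both arms the per-vCPU Xen timer and, via KVM_XEN_VCPU_GET_ATTR, reports whether it is still pending. A sketch of arming a oneshot timer roughly 100 ms out (vcpu_fd and now_ns are placeholders supplied by the caller; port 13 mirrors the selftest's EVTCHN_TIMER):

struct kvm_xen_vcpu_attr tmr = {
	.type               = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
	.u.timer.port       = 13,	/* event channel signalled on expiry */
	.u.timer.priority   = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
	.u.timer.expires_ns = now_ns + 100000000ULL,
};

ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &tmr);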

View File

@ -16,6 +16,7 @@
/x86_64/debug_regs /x86_64/debug_regs
/x86_64/evmcs_test /x86_64/evmcs_test
/x86_64/emulator_error_test /x86_64/emulator_error_test
/x86_64/fix_hypercall_test
/x86_64/get_msr_index_features /x86_64/get_msr_index_features
/x86_64/kvm_clock_test /x86_64/kvm_clock_test
/x86_64/kvm_pv_test /x86_64/kvm_pv_test

View File

@ -48,6 +48,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test
TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test
TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features
@ -65,6 +66,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/state_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test
TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test
TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test
TEST_GEN_PROGS_x86_64 += x86_64/tsc_scaling_sync
TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test
TEST_GEN_PROGS_x86_64 += x86_64/userspace_io_test TEST_GEN_PROGS_x86_64 += x86_64/userspace_io_test
TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test

View File

@ -0,0 +1,170 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2020, Google LLC.
*
* Tests for KVM paravirtual feature disablement
*/
#include <asm/kvm_para.h>
#include <linux/kvm_para.h>
#include <linux/stringify.h>
#include <stdint.h>
#include "apic.h"
#include "test_util.h"
#include "kvm_util.h"
#include "processor.h"
#define VCPU_ID 0
static bool ud_expected;
static void guest_ud_handler(struct ex_regs *regs)
{
GUEST_ASSERT(ud_expected);
GUEST_DONE();
}
extern unsigned char svm_hypercall_insn;
static uint64_t svm_do_sched_yield(uint8_t apic_id)
{
uint64_t ret;
asm volatile("mov %1, %%rax\n\t"
"mov %2, %%rbx\n\t"
"svm_hypercall_insn:\n\t"
"vmmcall\n\t"
"mov %%rax, %0\n\t"
: "=r"(ret)
: "r"((uint64_t)KVM_HC_SCHED_YIELD), "r"((uint64_t)apic_id)
: "rax", "rbx", "memory");
return ret;
}
extern unsigned char vmx_hypercall_insn;
static uint64_t vmx_do_sched_yield(uint8_t apic_id)
{
uint64_t ret;
asm volatile("mov %1, %%rax\n\t"
"mov %2, %%rbx\n\t"
"vmx_hypercall_insn:\n\t"
"vmcall\n\t"
"mov %%rax, %0\n\t"
: "=r"(ret)
: "r"((uint64_t)KVM_HC_SCHED_YIELD), "r"((uint64_t)apic_id)
: "rax", "rbx", "memory");
return ret;
}
static void assert_hypercall_insn(unsigned char *exp_insn, unsigned char *obs_insn)
{
uint32_t exp = 0, obs = 0;
memcpy(&exp, exp_insn, sizeof(exp));
memcpy(&obs, obs_insn, sizeof(obs));
GUEST_ASSERT_EQ(exp, obs);
}
static void guest_main(void)
{
unsigned char *native_hypercall_insn, *hypercall_insn;
uint8_t apic_id;
apic_id = GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID));
if (is_intel_cpu()) {
native_hypercall_insn = &vmx_hypercall_insn;
hypercall_insn = &svm_hypercall_insn;
svm_do_sched_yield(apic_id);
} else if (is_amd_cpu()) {
native_hypercall_insn = &svm_hypercall_insn;
hypercall_insn = &vmx_hypercall_insn;
vmx_do_sched_yield(apic_id);
} else {
GUEST_ASSERT(0);
/* unreachable */
return;
}
GUEST_ASSERT(!ud_expected);
assert_hypercall_insn(native_hypercall_insn, hypercall_insn);
GUEST_DONE();
}
static void setup_ud_vector(struct kvm_vm *vm)
{
vm_init_descriptor_tables(vm);
vcpu_init_descriptor_tables(vm, VCPU_ID);
vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
}
static void enter_guest(struct kvm_vm *vm)
{
struct kvm_run *run;
struct ucall uc;
run = vcpu_state(vm, VCPU_ID);
vcpu_run(vm, VCPU_ID);
switch (get_ucall(vm, VCPU_ID, &uc)) {
case UCALL_SYNC:
pr_info("%s: %016lx\n", (const char *)uc.args[2], uc.args[3]);
break;
case UCALL_DONE:
return;
case UCALL_ABORT:
TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, uc.args[1]);
default:
TEST_FAIL("Unhandled ucall: %ld\nexit_reason: %u (%s)",
uc.cmd, run->exit_reason, exit_reason_str(run->exit_reason));
}
}
static void test_fix_hypercall(void)
{
struct kvm_vm *vm;
vm = vm_create_default(VCPU_ID, 0, guest_main);
setup_ud_vector(vm);
ud_expected = false;
sync_global_to_guest(vm, ud_expected);
virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA);
enter_guest(vm);
}
static void test_fix_hypercall_disabled(void)
{
struct kvm_enable_cap cap = {0};
struct kvm_vm *vm;
vm = vm_create_default(VCPU_ID, 0, guest_main);
setup_ud_vector(vm);
cap.cap = KVM_CAP_DISABLE_QUIRKS2;
cap.args[0] = KVM_X86_QUIRK_FIX_HYPERCALL_INSN;
vm_enable_cap(vm, &cap);
ud_expected = true;
sync_global_to_guest(vm, ud_expected);
virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA);
enter_guest(vm);
}
int main(void)
{
if (!(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) {
print_skip("KVM_X86_QUIRK_HYPERCALL_INSN not supported");
exit(KSFT_SKIP);
}
test_fix_hypercall();
test_fix_hypercall_disabled();
}

View File

@ -0,0 +1,119 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* svm_vmcall_test
*
* Copyright © 2021 Amazon.com, Inc. or its affiliates.
*
* Xen shared_info / pvclock testing
*/
#include "test_util.h"
#include "kvm_util.h"
#include "processor.h"
#include <stdint.h>
#include <time.h>
#include <sched.h>
#include <signal.h>
#include <pthread.h>
#define NR_TEST_VCPUS 20
static struct kvm_vm *vm;
pthread_spinlock_t create_lock;
#define TEST_TSC_KHZ 2345678UL
#define TEST_TSC_OFFSET 200000000
uint64_t tsc_sync;
static void guest_code(void)
{
uint64_t start_tsc, local_tsc, tmp;
start_tsc = rdtsc();
do {
tmp = READ_ONCE(tsc_sync);
local_tsc = rdtsc();
WRITE_ONCE(tsc_sync, local_tsc);
if (unlikely(local_tsc < tmp))
GUEST_SYNC_ARGS(0, local_tsc, tmp, 0, 0);
} while (local_tsc - start_tsc < 5000 * TEST_TSC_KHZ);
GUEST_DONE();
}
static void *run_vcpu(void *_cpu_nr)
{
unsigned long cpu = (unsigned long)_cpu_nr;
unsigned long failures = 0;
static bool first_cpu_done;
/* The kernel is fine, but vm_vcpu_add_default() needs locking */
pthread_spin_lock(&create_lock);
vm_vcpu_add_default(vm, cpu, guest_code);
if (!first_cpu_done) {
first_cpu_done = true;
vcpu_set_msr(vm, cpu, MSR_IA32_TSC, TEST_TSC_OFFSET);
}
pthread_spin_unlock(&create_lock);
for (;;) {
volatile struct kvm_run *run = vcpu_state(vm, cpu);
struct ucall uc;
vcpu_run(vm, cpu);
TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
"Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
run->exit_reason,
exit_reason_str(run->exit_reason));
switch (get_ucall(vm, cpu, &uc)) {
case UCALL_DONE:
goto out;
case UCALL_SYNC:
printf("Guest %ld sync %lx %lx %ld\n", cpu, uc.args[2], uc.args[3], uc.args[2] - uc.args[3]);
failures++;
break;
default:
TEST_FAIL("Unknown ucall %lu", uc.cmd);
}
}
out:
return (void *)failures;
}
int main(int argc, char *argv[])
{
if (!kvm_check_cap(KVM_CAP_VM_TSC_CONTROL)) {
print_skip("KVM_CAP_VM_TSC_CONTROL not available");
exit(KSFT_SKIP);
}
vm = vm_create_default_with_vcpus(0, DEFAULT_STACK_PGS * NR_TEST_VCPUS, 0, guest_code, NULL);
vm_ioctl(vm, KVM_SET_TSC_KHZ, (void *) TEST_TSC_KHZ);
pthread_spin_init(&create_lock, PTHREAD_PROCESS_PRIVATE);
pthread_t cpu_threads[NR_TEST_VCPUS];
unsigned long cpu;
for (cpu = 0; cpu < NR_TEST_VCPUS; cpu++)
pthread_create(&cpu_threads[cpu], NULL, run_vcpu, (void *)cpu);
unsigned long failures = 0;
for (cpu = 0; cpu < NR_TEST_VCPUS; cpu++) {
void *this_cpu_failures;
pthread_join(cpu_threads[cpu], &this_cpu_failures);
failures += (unsigned long)this_cpu_failures;
}
TEST_ASSERT(!failures, "TSC sync failed");
pthread_spin_destroy(&create_lock);
kvm_vm_free(vm);
return 0;
}

View File

@ -39,12 +39,36 @@
#define EVTCHN_VECTOR 0x10 #define EVTCHN_VECTOR 0x10
#define EVTCHN_TEST1 15
#define EVTCHN_TEST2 66
#define EVTCHN_TIMER 13
static struct kvm_vm *vm; static struct kvm_vm *vm;
#define XEN_HYPERCALL_MSR 0x40000000 #define XEN_HYPERCALL_MSR 0x40000000
#define MIN_STEAL_TIME 50000 #define MIN_STEAL_TIME 50000
#define __HYPERVISOR_set_timer_op 15
#define __HYPERVISOR_sched_op 29
#define __HYPERVISOR_event_channel_op 32
#define SCHEDOP_poll 3
#define EVTCHNOP_send 4
#define EVTCHNSTAT_interdomain 2
struct evtchn_send {
u32 port;
};
struct sched_poll {
u32 *ports;
unsigned int nr_ports;
u64 timeout;
};
struct pvclock_vcpu_time_info { struct pvclock_vcpu_time_info {
u32 version; u32 version;
u32 pad0; u32 pad0;
@ -107,15 +131,25 @@ struct {
struct kvm_irq_routing_entry entries[2]; struct kvm_irq_routing_entry entries[2];
} irq_routes; } irq_routes;
bool guest_saw_irq;
static void evtchn_handler(struct ex_regs *regs) static void evtchn_handler(struct ex_regs *regs)
{ {
struct vcpu_info *vi = (void *)VCPU_INFO_VADDR; struct vcpu_info *vi = (void *)VCPU_INFO_VADDR;
vi->evtchn_upcall_pending = 0; vi->evtchn_upcall_pending = 0;
vi->evtchn_pending_sel = 0; vi->evtchn_pending_sel = 0;
guest_saw_irq = true;
GUEST_SYNC(0x20); GUEST_SYNC(0x20);
} }
static void guest_wait_for_irq(void)
{
while (!guest_saw_irq)
__asm__ __volatile__ ("rep nop" : : : "memory");
guest_saw_irq = false;
}
static void guest_code(void) static void guest_code(void)
{ {
struct vcpu_runstate_info *rs = (void *)RUNSTATE_VADDR; struct vcpu_runstate_info *rs = (void *)RUNSTATE_VADDR;
@ -128,6 +162,8 @@ static void guest_code(void)
/* Trigger an interrupt injection */ /* Trigger an interrupt injection */
GUEST_SYNC(0); GUEST_SYNC(0);
guest_wait_for_irq();
/* Test having the host set runstates manually */ /* Test having the host set runstates manually */
GUEST_SYNC(RUNSTATE_runnable); GUEST_SYNC(RUNSTATE_runnable);
GUEST_ASSERT(rs->time[RUNSTATE_runnable] != 0); GUEST_ASSERT(rs->time[RUNSTATE_runnable] != 0);
@ -168,14 +204,132 @@ static void guest_code(void)
/* Now deliver an *unmasked* interrupt */ /* Now deliver an *unmasked* interrupt */
GUEST_SYNC(8); GUEST_SYNC(8);
while (!si->evtchn_pending[1]) guest_wait_for_irq();
__asm__ __volatile__ ("rep nop" : : : "memory");
/* Change memslots and deliver an interrupt */ /* Change memslots and deliver an interrupt */
GUEST_SYNC(9); GUEST_SYNC(9);
for (;;) guest_wait_for_irq();
__asm__ __volatile__ ("rep nop" : : : "memory");
/* Deliver event channel with KVM_XEN_HVM_EVTCHN_SEND */
GUEST_SYNC(10);
guest_wait_for_irq();
GUEST_SYNC(11);
/* Our turn. Deliver event channel (to ourselves) with
* EVTCHNOP_send hypercall. */
unsigned long rax;
struct evtchn_send s = { .port = 127 };
__asm__ __volatile__ ("vmcall" :
"=a" (rax) :
"a" (__HYPERVISOR_event_channel_op),
"D" (EVTCHNOP_send),
"S" (&s));
GUEST_ASSERT(rax == 0);
guest_wait_for_irq();
GUEST_SYNC(12);
/* Deliver "outbound" event channel to an eventfd which
* happens to be one of our own irqfds. */
s.port = 197;
__asm__ __volatile__ ("vmcall" :
"=a" (rax) :
"a" (__HYPERVISOR_event_channel_op),
"D" (EVTCHNOP_send),
"S" (&s));
GUEST_ASSERT(rax == 0);
guest_wait_for_irq();
GUEST_SYNC(13);
/* Set a timer 100ms in the future. */
__asm__ __volatile__ ("vmcall" :
"=a" (rax) :
"a" (__HYPERVISOR_set_timer_op),
"D" (rs->state_entry_time + 100000000));
GUEST_ASSERT(rax == 0);
GUEST_SYNC(14);
/* Now wait for the timer */
guest_wait_for_irq();
GUEST_SYNC(15);
/* The host has 'restored' the timer. Just wait for it. */
guest_wait_for_irq();
GUEST_SYNC(16);
/* Poll for an event channel port which is already set */
u32 ports[1] = { EVTCHN_TIMER };
struct sched_poll p = {
.ports = ports,
.nr_ports = 1,
.timeout = 0,
};
__asm__ __volatile__ ("vmcall" :
"=a" (rax) :
"a" (__HYPERVISOR_sched_op),
"D" (SCHEDOP_poll),
"S" (&p));
GUEST_ASSERT(rax == 0);
GUEST_SYNC(17);
/* Poll for an unset port and wait for the timeout. */
p.timeout = 100000000;
__asm__ __volatile__ ("vmcall" :
"=a" (rax) :
"a" (__HYPERVISOR_sched_op),
"D" (SCHEDOP_poll),
"S" (&p));
GUEST_ASSERT(rax == 0);
GUEST_SYNC(18);
/* A timer will wake the masked port we're waiting on, while we poll */
p.timeout = 0;
__asm__ __volatile__ ("vmcall" :
"=a" (rax) :
"a" (__HYPERVISOR_sched_op),
"D" (SCHEDOP_poll),
"S" (&p));
GUEST_ASSERT(rax == 0);
GUEST_SYNC(19);
/* A timer will wake an *unmasked* port, which should wake us with an
* actual interrupt, while we're polling on a different port. */
ports[0]++;
p.timeout = 0;
__asm__ __volatile__ ("vmcall" :
"=a" (rax) :
"a" (__HYPERVISOR_sched_op),
"D" (SCHEDOP_poll),
"S" (&p));
GUEST_ASSERT(rax == 0);
guest_wait_for_irq();
GUEST_SYNC(20);
/* Timer should have fired already */
guest_wait_for_irq();
GUEST_SYNC(21);
} }
static int cmp_timespec(struct timespec *a, struct timespec *b) static int cmp_timespec(struct timespec *a, struct timespec *b)
@ -191,9 +345,13 @@ static int cmp_timespec(struct timespec *a, struct timespec *b)
else else
return 0; return 0;
} }
struct vcpu_info *vinfo;
static void handle_alrm(int sig) static void handle_alrm(int sig)
{ {
if (vinfo)
printf("evtchn_upcall_pending 0x%x\n", vinfo->evtchn_upcall_pending);
vcpu_dump(stdout, vm, VCPU_ID, 0);
TEST_FAIL("IRQ delivery timed out"); TEST_FAIL("IRQ delivery timed out");
} }
@ -213,6 +371,7 @@ int main(int argc, char *argv[])
bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE); bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE);
bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL); bool do_eventfd_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL);
bool do_evtchn_tests = do_eventfd_tests && !!(xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND);
clock_gettime(CLOCK_REALTIME, &min_ts); clock_gettime(CLOCK_REALTIME, &min_ts);
@ -233,6 +392,12 @@ int main(int argc, char *argv[])
.flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL, .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
.msr = XEN_HYPERCALL_MSR, .msr = XEN_HYPERCALL_MSR,
}; };
/* Let the kernel know that we *will* use it for sending all
* event channels, which lets it intercept SCHEDOP_poll */
if (do_evtchn_tests)
hvmc.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc); vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc);
struct kvm_xen_hvm_attr lm = { struct kvm_xen_hvm_attr lm = {
@ -295,7 +460,7 @@ int main(int argc, char *argv[])
/* Unexpected, but not a KVM failure */ /* Unexpected, but not a KVM failure */
if (irq_fd[0] == -1 || irq_fd[1] == -1) if (irq_fd[0] == -1 || irq_fd[1] == -1)
do_eventfd_tests = false; do_evtchn_tests = do_eventfd_tests = false;
} }
if (do_eventfd_tests) { if (do_eventfd_tests) {
@ -303,13 +468,13 @@ int main(int argc, char *argv[])
irq_routes.entries[0].gsi = 32; irq_routes.entries[0].gsi = 32;
irq_routes.entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN; irq_routes.entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
irq_routes.entries[0].u.xen_evtchn.port = 15; irq_routes.entries[0].u.xen_evtchn.port = EVTCHN_TEST1;
irq_routes.entries[0].u.xen_evtchn.vcpu = VCPU_ID; irq_routes.entries[0].u.xen_evtchn.vcpu = VCPU_ID;
irq_routes.entries[0].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; irq_routes.entries[0].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
irq_routes.entries[1].gsi = 33; irq_routes.entries[1].gsi = 33;
irq_routes.entries[1].type = KVM_IRQ_ROUTING_XEN_EVTCHN; irq_routes.entries[1].type = KVM_IRQ_ROUTING_XEN_EVTCHN;
irq_routes.entries[1].u.xen_evtchn.port = 66; irq_routes.entries[1].u.xen_evtchn.port = EVTCHN_TEST2;
irq_routes.entries[1].u.xen_evtchn.vcpu = VCPU_ID; irq_routes.entries[1].u.xen_evtchn.vcpu = VCPU_ID;
irq_routes.entries[1].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; irq_routes.entries[1].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
@ -330,7 +495,39 @@ int main(int argc, char *argv[])
sigaction(SIGALRM, &sa, NULL); sigaction(SIGALRM, &sa, NULL);
} }
struct vcpu_info *vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR); struct kvm_xen_vcpu_attr tmr = {
.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
.u.timer.port = EVTCHN_TIMER,
.u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
.u.timer.expires_ns = 0
};
if (do_evtchn_tests) {
struct kvm_xen_hvm_attr inj = {
.type = KVM_XEN_ATTR_TYPE_EVTCHN,
.u.evtchn.send_port = 127,
.u.evtchn.type = EVTCHNSTAT_interdomain,
.u.evtchn.flags = 0,
.u.evtchn.deliver.port.port = EVTCHN_TEST1,
.u.evtchn.deliver.port.vcpu = VCPU_ID + 1,
.u.evtchn.deliver.port.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
};
vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
/* Test migration to a different vCPU */
inj.u.evtchn.flags = KVM_XEN_EVTCHN_UPDATE;
inj.u.evtchn.deliver.port.vcpu = VCPU_ID;
vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
inj.u.evtchn.send_port = 197;
inj.u.evtchn.deliver.eventfd.port = 0;
inj.u.evtchn.deliver.eventfd.fd = irq_fd[1];
inj.u.evtchn.flags = 0;
vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &inj);
vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr);
}
vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR);
vinfo->evtchn_upcall_pending = 0; vinfo->evtchn_upcall_pending = 0;
struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR); struct vcpu_runstate_info *rs = addr_gpa2hva(vm, RUNSTATE_ADDR);
@ -423,7 +620,7 @@ int main(int argc, char *argv[])
goto done; goto done;
if (verbose) if (verbose)
printf("Testing masked event channel\n"); printf("Testing masked event channel\n");
shinfo->evtchn_mask[0] = 0x8000; shinfo->evtchn_mask[0] = 1UL << EVTCHN_TEST1;
eventfd_write(irq_fd[0], 1UL); eventfd_write(irq_fd[0], 1UL);
alarm(1); alarm(1);
break; break;
@ -440,6 +637,9 @@ int main(int argc, char *argv[])
break; break;
case 9: case 9:
TEST_ASSERT(!evtchn_irq_expected,
"Expected event channel IRQ but it didn't happen");
shinfo->evtchn_pending[1] = 0;
if (verbose) if (verbose)
printf("Testing event channel after memslot change\n"); printf("Testing event channel after memslot change\n");
vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
@ -449,12 +649,153 @@ int main(int argc, char *argv[])
alarm(1); alarm(1);
break; break;
case 10:
TEST_ASSERT(!evtchn_irq_expected,
"Expected event channel IRQ but it didn't happen");
if (!do_evtchn_tests)
goto done;
shinfo->evtchn_pending[0] = 0;
if (verbose)
printf("Testing injection with KVM_XEN_HVM_EVTCHN_SEND\n");
struct kvm_irq_routing_xen_evtchn e;
e.port = EVTCHN_TEST2;
e.vcpu = VCPU_ID;
e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
vm_ioctl(vm, KVM_XEN_HVM_EVTCHN_SEND, &e);
evtchn_irq_expected = true;
alarm(1);
break;
case 11:
TEST_ASSERT(!evtchn_irq_expected,
"Expected event channel IRQ but it didn't happen");
shinfo->evtchn_pending[1] = 0;
if (verbose)
printf("Testing guest EVTCHNOP_send direct to evtchn\n");
evtchn_irq_expected = true;
alarm(1);
break;
case 12:
TEST_ASSERT(!evtchn_irq_expected,
"Expected event channel IRQ but it didn't happen");
shinfo->evtchn_pending[0] = 0;
if (verbose)
printf("Testing guest EVTCHNOP_send to eventfd\n");
evtchn_irq_expected = true;
alarm(1);
break;
case 13:
TEST_ASSERT(!evtchn_irq_expected,
"Expected event channel IRQ but it didn't happen");
shinfo->evtchn_pending[1] = 0;
if (verbose)
printf("Testing guest oneshot timer\n");
break;
case 14:
memset(&tmr, 0, sizeof(tmr));
tmr.type = KVM_XEN_VCPU_ATTR_TYPE_TIMER;
vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &tmr);
TEST_ASSERT(tmr.u.timer.port == EVTCHN_TIMER,
"Timer port not returned");
TEST_ASSERT(tmr.u.timer.priority == KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
"Timer priority not returned");
TEST_ASSERT(tmr.u.timer.expires_ns > rs->state_entry_time,
"Timer expiry not returned");
evtchn_irq_expected = true;
alarm(1);
break;
case 15:
TEST_ASSERT(!evtchn_irq_expected,
"Expected event channel IRQ but it didn't happen");
shinfo->evtchn_pending[0] = 0;
if (verbose)
printf("Testing restored oneshot timer\n");
tmr.u.timer.expires_ns = rs->state_entry_time + 100000000;
vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr);
evtchn_irq_expected = true;
alarm(1);
break;
case 16:
TEST_ASSERT(!evtchn_irq_expected,
"Expected event channel IRQ but it didn't happen");
if (verbose)
printf("Testing SCHEDOP_poll with already pending event\n");
shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 1UL << EVTCHN_TIMER;
alarm(1);
break;
case 17:
if (verbose)
printf("Testing SCHEDOP_poll timeout\n");
shinfo->evtchn_pending[0] = 0;
alarm(1);
break;
case 18:
if (verbose)
printf("Testing SCHEDOP_poll wake on masked event\n");
tmr.u.timer.expires_ns = rs->state_entry_time + 100000000;
vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr);
alarm(1);
break;
case 19:
shinfo->evtchn_pending[0] = shinfo->evtchn_mask[0] = 0;
if (verbose)
printf("Testing SCHEDOP_poll wake on unmasked event\n");
evtchn_irq_expected = true;
tmr.u.timer.expires_ns = rs->state_entry_time + 100000000;
vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr);
/* Read it back and check the pending time is reported correctly */
tmr.u.timer.expires_ns = 0;
vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &tmr);
TEST_ASSERT(tmr.u.timer.expires_ns == rs->state_entry_time + 100000000,
"Timer not reported pending");
alarm(1);
break;
case 20:
TEST_ASSERT(!evtchn_irq_expected,
"Expected event channel IRQ but it didn't happen");
/* Read timer and check it is no longer pending */
vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_GET_ATTR, &tmr);
TEST_ASSERT(!tmr.u.timer.expires_ns, "Timer still reported pending");
shinfo->evtchn_pending[0] = 0;
if (verbose)
printf("Testing timer in the past\n");
evtchn_irq_expected = true;
tmr.u.timer.expires_ns = rs->state_entry_time - 100000000ULL;
vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &tmr);
alarm(1);
break;
case 21:
TEST_ASSERT(!evtchn_irq_expected,
"Expected event channel IRQ but it didn't happen");
goto done;
case 0x20: case 0x20:
TEST_ASSERT(evtchn_irq_expected, "Unexpected event channel IRQ"); TEST_ASSERT(evtchn_irq_expected, "Unexpected event channel IRQ");
evtchn_irq_expected = false; evtchn_irq_expected = false;
if (shinfo->evtchn_pending[1] &&
shinfo->evtchn_pending[0])
goto done;
break; break;
} }
break; break;
@ -467,6 +808,7 @@ int main(int argc, char *argv[])
} }
done: done:
alarm(0);
clock_gettime(CLOCK_REALTIME, &max_ts); clock_gettime(CLOCK_REALTIME, &max_ts);
/* /*