X86 interrupt handling update:

Support for posted interrupts on bare metal
 
     Posted interrupts is a virtualization feature which allows to inject
     interrupts directly into a guest without host interaction. The VT-d
     interrupt remapping hardware sets the bit which corresponds to the
     interrupt vector in a vector bitmap which is either used to inject the
     interrupt directly into the guest via a virtualized APIC or in case
     that the guest is scheduled out provides a host side notification
     interrupt which informs the host that an interrupt has been marked
     pending in the bitmap.
 
     This can be utilized on bare metal for scenarios where multiple
     devices, e.g. NVME storage, raise interrupts with a high frequency.  In
     the default mode these interrupts are handled independently and
     therefore require a full roundtrip of interrupt entry/exit.
 
     Utilizing posted interrupts this roundtrip overhead can be avoided by
     coalescing these interrupt entries to a single entry for the posted
     interrupt notification. The notification interrupt then demultiplexes
     the pending bits in a memory based bitmap and invokes the corresponding
     device specific handlers.
 
     Depending on the usage scenario and device utilization throughput
     improvements between 10% and 130% have been measured.
 
     As this is only relevant for high end servers with multiple device
     queues per CPU attached and counterproductive for situations where
     interrupts are arriving at distinct times, the functionality is opt-in
     via a kernel command line parameter.
 -----BEGIN PGP SIGNATURE-----
 
 iQJHBAABCgAxFiEEQp8+kY+LLUocC4bMphj1TA10mKEFAmZBGUITHHRnbHhAbGlu
 dXRyb25peC5kZQAKCRCmGPVMDXSYod3xD/98Xa4qZN7eceyyGUhgXnPLOKQzGQ7k
 7cmhsoAYjABeXLvuAvtKePL7ky7OPcqVW2E/g0+jdZuRDkRDbnVkM7CDMRTyL0/b
 BZLhVAXyANKjK79a5WvjL0zDasYQRQ16MQJ6TPa++mX0KhZSI7KvXWIqPWov5i02
 n8UbPUraH5bJi3qGKm6u4n2261Be1gtDag0ZjmGma45/3wsn3bWPoB7iPK6qxmq3
 Q7VARPXAcRp5wYACk6mCOM1dOXMUV9CgI5AUk92xGfXi4RAdsFeNSzeQWn9jHWOf
 CYbbJjNl4QmGP4IWmy6/Up4vIiEhUCOT2DmHsygrQTs/G+nPnMAe1qUuDuECiofj
 iToBL3hn1dHG8uINKOB81MJ33QEGWyYWY8PxxoR3LMTrhVpfChUlJO8T2XK5nu+i
 2EA6XLtJiHacpXhn8HQam0aQN9nvi4wT1LzpkhmboyCQuXTiXuJNbyLIh5TdFa1n
 DzqAGhRB67z6eGevJJ7kTI1X71W0poMwYlzCU8itnLOK8np0zFQ8bgwwqm9opZGq
 V2eSDuZAbqXVolzmaF8NSfM+b/R9URQtWsZ8cEc+/OdVV4HR4zfeqejy60TuV/4G
 39CTnn8vPBKcRSS6CAcJhKPhzIvHw4EMhoU4DJKBtwBdM58RyP9NY1wF3rIPJIGh
 sl61JBuYYuIZXg==
 =bqLN
 -----END PGP SIGNATURE-----

Merge tag 'x86-irq-2024-05-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 interrupt handling updates from Thomas Gleixner:
 "Add support for posted interrupts on bare metal.

  Posted interrupts is a virtualization feature which allows to inject
  interrupts directly into a guest without host interaction. The VT-d
  interrupt remapping hardware sets the bit which corresponds to the
  interrupt vector in a vector bitmap which is either used to inject the
  interrupt directly into the guest via a virtualized APIC or in case
  that the guest is scheduled out provides a host side notification
  interrupt which informs the host that an interrupt has been marked
  pending in the bitmap.

  This can be utilized on bare metal for scenarios where multiple
  devices, e.g. NVME storage, raise interrupts with a high frequency. In
   the default mode these interrupts are handled independently and
  therefore require a full roundtrip of interrupt entry/exit.

  Utilizing posted interrupts this roundtrip overhead can be avoided by
  coalescing these interrupt entries to a single entry for the posted
  interrupt notification. The notification interrupt then demultiplexes
  the pending bits in a memory based bitmap and invokes the
  corresponding device specific handlers.

  Depending on the usage scenario and device utilization throughput
  improvements between 10% and 130% have been measured.

  As this is only relevant for high end servers with multiple device
  queues per CPU attached and counterproductive for situations where
  interrupts are arriving at distinct times, the functionality is opt-in
  via a kernel command line parameter"

* tag 'x86-irq-2024-05-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/irq: Use existing helper for pending vector check
  iommu/vt-d: Enable posted mode for device MSIs
  iommu/vt-d: Make posted MSI an opt-in command line option
  x86/irq: Extend checks for pending vectors to posted interrupts
  x86/irq: Factor out common code for checking pending interrupts
  x86/irq: Install posted MSI notification handler
  x86/irq: Factor out handler invocation from common_interrupt()
  x86/irq: Set up per host CPU posted interrupt descriptors
  x86/irq: Reserve a per CPU IDT vector for posted MSIs
  x86/irq: Add a Kconfig option for posted MSI
  x86/irq: Remove bitfields in posted interrupt descriptor
  x86/irq: Unionize PID.PIR for 64bit access w/o casting
  KVM: VMX: Move posted interrupt descriptor out of VMX code
This commit is contained in:
Linus Torvalds 2024-05-14 10:01:29 -07:00
commit 9776dd3609
19 changed files with 450 additions and 119 deletions

View File

@ -2251,6 +2251,8 @@
no_x2apic_optout no_x2apic_optout
BIOS x2APIC opt-out request will be ignored BIOS x2APIC opt-out request will be ignored
nopost disable Interrupt Posting nopost disable Interrupt Posting
posted_msi
enable MSIs delivered as posted interrupts
iomem= Disable strict checking of access to MMIO memory iomem= Disable strict checking of access to MMIO memory
strict regions from userspace. strict regions from userspace.

View File

@ -466,6 +466,17 @@ config X86_X2APIC
If you don't know what to do here, say N. If you don't know what to do here, say N.
config X86_POSTED_MSI
bool "Enable MSI and MSI-x delivery by posted interrupts"
depends on X86_64 && IRQ_REMAP
help
This enables MSIs that are under interrupt remapping to be delivered as
posted interrupts to the host kernel. Interrupt throughput can
potentially be improved by coalescing CPU notifications during high
frequency bursts.
If you don't know what to do here, say N.
config X86_MPPARSE config X86_MPPARSE
bool "Enable MPS table" if ACPI bool "Enable MPS table" if ACPI
default y default y

View File

@ -117,6 +117,8 @@ static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = {
SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi),
SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi),
SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi),
SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, posted_msi_notification),
}; };
static bool fred_setup_done __initdata; static bool fred_setup_done __initdata;

View File

@ -14,6 +14,7 @@
#include <asm/msr.h> #include <asm/msr.h>
#include <asm/hardirq.h> #include <asm/hardirq.h>
#include <asm/io.h> #include <asm/io.h>
#include <asm/posted_intr.h>
#define ARCH_APICTIMER_STOPS_ON_C3 1 #define ARCH_APICTIMER_STOPS_ON_C3 1
@ -500,6 +501,11 @@ static inline bool lapic_vector_set_in_irr(unsigned int vector)
return !!(irr & (1U << (vector % 32))); return !!(irr & (1U << (vector % 32)));
} }
/*
 * Check whether @vector is pending on this CPU, either in the local APIC
 * IRR or in the posted-MSI PIR bitmap.  pi_pending_this_cpu() compiles to
 * a constant false unless CONFIG_X86_POSTED_MSI is enabled.
 */
static inline bool is_vector_pending(unsigned int vector)
{
return lapic_vector_set_in_irr(vector) || pi_pending_this_cpu(vector);
}
/* /*
* Warm reset vector position: * Warm reset vector position:
*/ */

View File

@ -44,10 +44,16 @@ typedef struct {
unsigned int irq_hv_reenlightenment_count; unsigned int irq_hv_reenlightenment_count;
unsigned int hyperv_stimer0_count; unsigned int hyperv_stimer0_count;
#endif #endif
#ifdef CONFIG_X86_POSTED_MSI
unsigned int posted_msi_notification_count;
#endif
} ____cacheline_aligned irq_cpustat_t; } ____cacheline_aligned irq_cpustat_t;
DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
#ifdef CONFIG_X86_POSTED_MSI
DECLARE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc);
#endif
#define __ARCH_IRQ_STAT #define __ARCH_IRQ_STAT
#define inc_irq_stat(member) this_cpu_inc(irq_stat.member) #define inc_irq_stat(member) this_cpu_inc(irq_stat.member)

View File

@ -751,6 +751,12 @@ DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested
# define fred_sysvec_kvm_posted_intr_nested_ipi NULL # define fred_sysvec_kvm_posted_intr_nested_ipi NULL
#endif #endif
# ifdef CONFIG_X86_POSTED_MSI
DECLARE_IDTENTRY_SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, sysvec_posted_msi_notification);
#else
# define fred_sysvec_posted_msi_notification NULL
# endif
#if IS_ENABLED(CONFIG_HYPERV) #if IS_ENABLED(CONFIG_HYPERV)
DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback);
DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment);

View File

@ -50,6 +50,13 @@ static inline struct irq_domain *arch_get_ir_parent_domain(void)
return x86_vector_domain; return x86_vector_domain;
} }
/* Opt-in flag, set by the "posted_msi" irqremap= command line option. */
extern bool enable_posted_msi;

/*
 * Posted MSIs are usable only when the user opted in AND the IOMMU
 * advertises interrupt-posting capability.
 */
static inline bool posted_msi_supported(void)
{
return enable_posted_msi && irq_remapping_cap(IRQ_POSTING_CAP);
}
#else /* CONFIG_IRQ_REMAP */ #else /* CONFIG_IRQ_REMAP */
static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; } static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; }

View File

@ -97,10 +97,16 @@
#define LOCAL_TIMER_VECTOR 0xec #define LOCAL_TIMER_VECTOR 0xec
/*
* Posted interrupt notification vector for all device MSIs delivered to
* the host kernel.
*/
#define POSTED_MSI_NOTIFICATION_VECTOR 0xeb
#define NR_VECTORS 256 #define NR_VECTORS 256
#ifdef CONFIG_X86_LOCAL_APIC #ifdef CONFIG_X86_LOCAL_APIC
#define FIRST_SYSTEM_VECTOR LOCAL_TIMER_VECTOR #define FIRST_SYSTEM_VECTOR POSTED_MSI_NOTIFICATION_VECTOR
#else #else
#define FIRST_SYSTEM_VECTOR NR_VECTORS #define FIRST_SYSTEM_VECTOR NR_VECTORS
#endif #endif

View File

@ -0,0 +1,118 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _X86_POSTED_INTR_H
#define _X86_POSTED_INTR_H
#include <asm/irq_vectors.h>
/* Bit positions within pi_desc.control (bits 256/257 of the descriptor) */
#define POSTED_INTR_ON 0
#define POSTED_INTR_SN 1
#define PID_TABLE_ENTRY_VALID 1
/*
 * Posted-Interrupt Descriptor (PID).
 *
 * Hardware-defined layout shared by the CPU and the VT-d remapping unit;
 * field offsets and the 64-byte alignment must not change.  pir and pir64
 * alias the same 256-bit Posted Interrupt Request bitmap, the second union
 * exposes the notification control word both as individual fields and as a
 * single 64-bit value for atomic access.
 */
struct pi_desc {
union {
u32 pir[8]; /* Posted interrupt requested */
u64 pir64[4]; /* Same bits, for 64-bit wide access without casting */
};
union {
struct {
u16 notifications; /* Suppress and outstanding bits */
u8 nv; /* Notification vector */
u8 rsvd_2;
u32 ndst; /* Notification destination (APIC ID) */
};
u64 control; /* Whole notification word for atomic bitops */
};
u32 rsvd[6];
} __aligned(64);
/*
 * Atomic helpers operating on the PID control word / PIR.  These must stay
 * atomic: the IOMMU updates the same cache line with atomic swaps when
 * posting interrupts.
 */
static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
{
return test_and_set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
}
static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
{
return test_and_clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
}
static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
{
return test_and_clear_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control);
}
/* Mark @vector pending in the PIR; returns true if it was already set. */
static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}
static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
{
return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
}
static inline void pi_set_sn(struct pi_desc *pi_desc)
{
set_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control);
}
static inline void pi_set_on(struct pi_desc *pi_desc)
{
set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
}
static inline void pi_clear_on(struct pi_desc *pi_desc)
{
clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
}
static inline void pi_clear_sn(struct pi_desc *pi_desc)
{
clear_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control);
}
static inline bool pi_test_on(struct pi_desc *pi_desc)
{
return test_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
}
static inline bool pi_test_sn(struct pi_desc *pi_desc)
{
return test_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control);
}
/*
 * Non-atomic helpers - only safe when the descriptor cannot be concurrently
 * updated by hardware or another CPU (e.g. during setup/reset).
 */
static inline void __pi_set_sn(struct pi_desc *pi_desc)
{
pi_desc->notifications |= BIT(POSTED_INTR_SN);
}
static inline void __pi_clear_sn(struct pi_desc *pi_desc)
{
pi_desc->notifications &= ~BIT(POSTED_INTR_SN);
}
#ifdef CONFIG_X86_POSTED_MSI
/*
 * Not all external vectors are subject to interrupt remapping, e.g. IOMMU's
 * own interrupts. Here we do not distinguish them since those vector bits in
 * PIR will always be zero.
 */
static inline bool pi_pending_this_cpu(unsigned int vector)
{
struct pi_desc *pid = this_cpu_ptr(&posted_msi_pi_desc);

/*
 * PIR holds exactly NR_VECTORS bits (0 .. NR_VECTORS - 1); the original
 * '>' check let vector == NR_VECTORS through, making test_bit() read one
 * bit past the bitmap into the notification word.
 */
if (WARN_ON_ONCE(vector >= NR_VECTORS || vector < FIRST_EXTERNAL_VECTOR))
return false;

return test_bit(vector, (unsigned long *)pid->pir);
}

extern void intel_posted_msi_init(void);
#else
static inline bool pi_pending_this_cpu(unsigned int vector) { return false; }
static inline void intel_posted_msi_init(void) { }
#endif /* CONFIG_X86_POSTED_MSI */
#endif /* _X86_POSTED_INTR_H */

View File

@ -965,7 +965,7 @@ static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr)
lockdep_assert_held(&vector_lock); lockdep_assert_held(&vector_lock);
hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) { hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) {
unsigned int irr, vector = apicd->prev_vector; unsigned int vector = apicd->prev_vector;
/* /*
* Paranoia: Check if the vector that needs to be cleaned * Paranoia: Check if the vector that needs to be cleaned
@ -979,8 +979,7 @@ static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr)
* fixup_irqs() was just called to scan IRR for set bits and * fixup_irqs() was just called to scan IRR for set bits and
* forward them to new destination CPUs via IPIs. * forward them to new destination CPUs via IPIs.
*/ */
irr = check_irr ? apic_read(APIC_IRR + (vector / 32 * 0x10)) : 0; if (check_irr && is_vector_pending(vector)) {
if (irr & (1U << (vector % 32))) {
pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq); pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq);
rearm = true; rearm = true;
continue; continue;

View File

@ -68,6 +68,7 @@
#include <asm/traps.h> #include <asm/traps.h>
#include <asm/sev.h> #include <asm/sev.h>
#include <asm/tdx.h> #include <asm/tdx.h>
#include <asm/posted_intr.h>
#include "cpu.h" #include "cpu.h"
@ -2222,6 +2223,8 @@ void cpu_init(void)
barrier(); barrier();
x2apic_setup(); x2apic_setup();
intel_posted_msi_init();
} }
mmgrab(&init_mm); mmgrab(&init_mm);

View File

@ -163,6 +163,9 @@ static const __initconst struct idt_data apic_idts[] = {
# endif # endif
INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt), INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt),
INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt), INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt),
# ifdef CONFIG_X86_POSTED_MSI
INTG(POSTED_MSI_NOTIFICATION_VECTOR, asm_sysvec_posted_msi_notification),
# endif
#endif #endif
}; };

View File

@ -22,6 +22,8 @@
#include <asm/desc.h> #include <asm/desc.h>
#include <asm/traps.h> #include <asm/traps.h>
#include <asm/thermal.h> #include <asm/thermal.h>
#include <asm/posted_intr.h>
#include <asm/irq_remapping.h>
#define CREATE_TRACE_POINTS #define CREATE_TRACE_POINTS
#include <asm/trace/irq_vectors.h> #include <asm/trace/irq_vectors.h>
@ -181,6 +183,13 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, "%10u ", seq_printf(p, "%10u ",
irq_stats(j)->kvm_posted_intr_wakeup_ipis); irq_stats(j)->kvm_posted_intr_wakeup_ipis);
seq_puts(p, " Posted-interrupt wakeup event\n"); seq_puts(p, " Posted-interrupt wakeup event\n");
#endif
#ifdef CONFIG_X86_POSTED_MSI
seq_printf(p, "%*s: ", prec, "PMN");
for_each_online_cpu(j)
seq_printf(p, "%10u ",
irq_stats(j)->posted_msi_notification_count);
seq_puts(p, " Posted MSI notification event\n");
#endif #endif
return 0; return 0;
} }
@ -240,24 +249,16 @@ static __always_inline void handle_irq(struct irq_desc *desc,
__handle_irq(desc, regs); __handle_irq(desc, regs);
} }
/* static __always_inline int call_irq_handler(int vector, struct pt_regs *regs)
* common_interrupt() handles all normal device IRQ's (the special SMP
* cross-CPU interrupts have their own entry points).
*/
DEFINE_IDTENTRY_IRQ(common_interrupt)
{ {
struct pt_regs *old_regs = set_irq_regs(regs);
struct irq_desc *desc; struct irq_desc *desc;
int ret = 0;
/* entry code tells RCU that we're not quiescent. Check it. */
RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
desc = __this_cpu_read(vector_irq[vector]); desc = __this_cpu_read(vector_irq[vector]);
if (likely(!IS_ERR_OR_NULL(desc))) { if (likely(!IS_ERR_OR_NULL(desc))) {
handle_irq(desc, regs); handle_irq(desc, regs);
} else { } else {
apic_eoi(); ret = -EINVAL;
if (desc == VECTOR_UNUSED) { if (desc == VECTOR_UNUSED) {
pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n", pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n",
__func__, smp_processor_id(), __func__, smp_processor_id(),
@ -267,6 +268,23 @@ DEFINE_IDTENTRY_IRQ(common_interrupt)
} }
} }
return ret;
}
/*
* common_interrupt() handles all normal device IRQ's (the special SMP
* cross-CPU interrupts have their own entry points).
*/
DEFINE_IDTENTRY_IRQ(common_interrupt)
{
struct pt_regs *old_regs = set_irq_regs(regs);
/* entry code tells RCU that we're not quiescent. Check it. */
RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
if (unlikely(call_irq_handler(vector, regs)))
apic_eoi();
set_irq_regs(old_regs); set_irq_regs(old_regs);
} }
@ -334,12 +352,139 @@ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi)
} }
#endif #endif
#ifdef CONFIG_X86_POSTED_MSI
/* Posted Interrupt Descriptors for coalesced MSIs to be posted */
DEFINE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc);
/*
 * Initialize this CPU's posted-MSI descriptor: program the notification
 * vector and the notification destination (this CPU's APIC ID).  Called
 * from cpu_init() on each CPU.
 */
void intel_posted_msi_init(void)
{
u32 destination;
u32 apic_id;
this_cpu_write(posted_msi_pi_desc.nv, POSTED_MSI_NOTIFICATION_VECTOR);
/*
 * APIC destination ID is stored in bit 8:15 while in XAPIC mode.
 * VT-d spec. CH 9.11
 */
apic_id = this_cpu_read(x86_cpu_to_apicid);
destination = x2apic_enabled() ? apic_id : apic_id << 8;
this_cpu_write(posted_msi_pi_desc.ndst, destination);
}
/*
* De-multiplexing posted interrupts is on the performance path, the code
* below is written to optimize the cache performance based on the following
* considerations:
* 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
* accessed by both CPU and IOMMU.
* 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg
* for checking and clearing posted interrupt request (PIR), a 256 bit field
* within the PID.
* 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
* line when posting interrupts and setting control bits.
* 4.The CPU can access the cache line a magnitude faster than the IOMMU.
* 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID
* cache line. The cache line states after each operation are as follows:
* CPU IOMMU PID Cache line state
* ---------------------------------------------------------------
*...read64 exclusive
*...lock xchg64 modified
*... post/atomic swap invalid
*...-------------------------------------------------------------
*
* To reduce L1 data cache miss, it is important to avoid contention with
* IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
* to dispatch interrupt handlers.
*
* In addition, the code is trying to keep the cache line state consistent
* as much as possible. e.g. when making a copy and clearing the PIR
* (assuming non-zero PIR bits are present in the entire PIR), it does:
* read, read, read, read, xchg, xchg, xchg, xchg
* instead of:
* read, xchg, read, xchg, read, xchg, read, xchg
*/
/*
 * Snapshot and clear the 256-bit PIR, then invoke the handler for every
 * vector found pending.  Returns true if any work was done.
 *
 * The read-all-then-xchg-all sequence (rather than read/xchg interleaved)
 * is deliberate - see the cache-line discussion above.
 */
static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs)
{
int i, vec = FIRST_EXTERNAL_VECTOR;
unsigned long pir_copy[4];
bool handled = false;
/* Pass 1: plain reads, keeps the cache line in shared/exclusive state */
for (i = 0; i < 4; i++)
pir_copy[i] = pir[i];
/* Pass 2: atomically clear only the words that had bits set */
for (i = 0; i < 4; i++) {
if (!pir_copy[i])
continue;
pir_copy[i] = arch_xchg(&pir[i], 0);
handled = true;
}
if (handled) {
/* Dispatch from the private copy, not the contended PID line */
for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR)
call_irq_handler(vec, regs);
}
return handled;
}
/*
* Performance data shows that 3 is good enough to harvest 90+% of the benefit
* on high IRQ rate workload.
*/
#define MAX_POSTED_MSI_COALESCING_LOOP 3
/*
* For MSIs that are delivered as posted interrupts, the CPU notifications
* can be coalesced if the MSIs arrive in high frequency bursts.
*/
DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
{
struct pt_regs *old_regs = set_irq_regs(regs);
struct pi_desc *pid;
int i = 0;
pid = this_cpu_ptr(&posted_msi_pi_desc);
inc_irq_stat(posted_msi_notification_count);
irq_enter();
/*
 * Max coalescing count includes the extra round of handle_pending_pir
 * after clearing the outstanding notification bit. Hence, at most
 * MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here.
 */
while (++i < MAX_POSTED_MSI_COALESCING_LOOP) {
if (!handle_pending_pir(pid->pir64, regs))
break;
}
/*
 * Clear outstanding notification bit to allow new IRQ notifications,
 * do this last to maximize the window of interrupt coalescing.
 */
pi_clear_on(pid);
/*
 * There could be a race of PI notification and the clearing of ON bit,
 * process PIR bits one last time such that handling the new interrupts
 * are not delayed until the next IRQ.
 */
handle_pending_pir(pid->pir64, regs);
/*
 * Single EOI for the whole burst - the per-device "INTEL-IR-POST" chip
 * deliberately does not EOI on ack (see dummy_ack()).
 */
apic_eoi();
irq_exit();
set_irq_regs(old_regs);
}
#endif /* X86_POSTED_MSI */
#ifdef CONFIG_HOTPLUG_CPU #ifdef CONFIG_HOTPLUG_CPU
/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
void fixup_irqs(void) void fixup_irqs(void)
{ {
unsigned int irr, vector; unsigned int vector;
struct irq_desc *desc; struct irq_desc *desc;
struct irq_data *data; struct irq_data *data;
struct irq_chip *chip; struct irq_chip *chip;
@ -366,8 +511,7 @@ void fixup_irqs(void)
if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector]))) if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector])))
continue; continue;
irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); if (is_vector_pending(vector)) {
if (irr & (1 << (vector % 32))) {
desc = __this_cpu_read(vector_irq[vector]); desc = __this_cpu_read(vector_irq[vector]);
raw_spin_lock(&desc->lock); raw_spin_lock(&desc->lock);

View File

@ -107,7 +107,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
* handle task migration (@cpu != vcpu->cpu). * handle task migration (@cpu != vcpu->cpu).
*/ */
new.ndst = dest; new.ndst = dest;
new.sn = 0; __pi_clear_sn(&new);
/* /*
* Restore the notification vector; in the blocking case, the * Restore the notification vector; in the blocking case, the
@ -157,7 +157,7 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
&per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); WARN(pi_test_sn(pi_desc), "PI descriptor SN field set before blocking");
old.control = READ_ONCE(pi_desc->control); old.control = READ_ONCE(pi_desc->control);
do { do {

View File

@ -1,98 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */ /* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KVM_X86_VMX_POSTED_INTR_H #ifndef __KVM_X86_VMX_POSTED_INTR_H
#define __KVM_X86_VMX_POSTED_INTR_H #define __KVM_X86_VMX_POSTED_INTR_H
#include <asm/posted_intr.h>
#define POSTED_INTR_ON 0
#define POSTED_INTR_SN 1
#define PID_TABLE_ENTRY_VALID 1
/* Posted-Interrupt Descriptor */
struct pi_desc {
u32 pir[8]; /* Posted interrupt requested */
union {
struct {
/* bit 256 - Outstanding Notification */
u16 on : 1,
/* bit 257 - Suppress Notification */
sn : 1,
/* bit 271:258 - Reserved */
rsvd_1 : 14;
/* bit 279:272 - Notification Vector */
u8 nv;
/* bit 287:280 - Reserved */
u8 rsvd_2;
/* bit 319:288 - Notification Destination */
u32 ndst;
};
u64 control;
};
u32 rsvd[6];
} __aligned(64);
static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
{
return test_and_set_bit(POSTED_INTR_ON,
(unsigned long *)&pi_desc->control);
}
static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
{
return test_and_clear_bit(POSTED_INTR_ON,
(unsigned long *)&pi_desc->control);
}
static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
{
return test_and_clear_bit(POSTED_INTR_SN,
(unsigned long *)&pi_desc->control);
}
static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}
static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
{
return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
}
static inline void pi_set_sn(struct pi_desc *pi_desc)
{
set_bit(POSTED_INTR_SN,
(unsigned long *)&pi_desc->control);
}
static inline void pi_set_on(struct pi_desc *pi_desc)
{
set_bit(POSTED_INTR_ON,
(unsigned long *)&pi_desc->control);
}
static inline void pi_clear_on(struct pi_desc *pi_desc)
{
clear_bit(POSTED_INTR_ON,
(unsigned long *)&pi_desc->control);
}
static inline void pi_clear_sn(struct pi_desc *pi_desc)
{
clear_bit(POSTED_INTR_SN,
(unsigned long *)&pi_desc->control);
}
static inline bool pi_test_on(struct pi_desc *pi_desc)
{
return test_bit(POSTED_INTR_ON,
(unsigned long *)&pi_desc->control);
}
static inline bool pi_test_sn(struct pi_desc *pi_desc)
{
return test_bit(POSTED_INTR_SN,
(unsigned long *)&pi_desc->control);
}
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);

View File

@ -70,6 +70,7 @@
#include "x86.h" #include "x86.h"
#include "smm.h" #include "smm.h"
#include "vmx_onhyperv.h" #include "vmx_onhyperv.h"
#include "posted_intr.h"
MODULE_AUTHOR("Qumranet"); MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL"); MODULE_LICENSE("GPL");
@ -4844,7 +4845,7 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
* or POSTED_INTR_WAKEUP_VECTOR. * or POSTED_INTR_WAKEUP_VECTOR.
*/ */
vmx->pi_desc.nv = POSTED_INTR_VECTOR; vmx->pi_desc.nv = POSTED_INTR_VECTOR;
vmx->pi_desc.sn = 1; __pi_set_sn(&vmx->pi_desc);
} }
static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)

View File

@ -7,10 +7,10 @@
#include <asm/kvm.h> #include <asm/kvm.h>
#include <asm/intel_pt.h> #include <asm/intel_pt.h>
#include <asm/perf_event.h> #include <asm/perf_event.h>
#include <asm/posted_intr.h>
#include "capabilities.h" #include "capabilities.h"
#include "../kvm_cache_regs.h" #include "../kvm_cache_regs.h"
#include "posted_intr.h"
#include "vmcs.h" #include "vmcs.h"
#include "vmx_ops.h" #include "vmx_ops.h"
#include "../cpuid.h" #include "../cpuid.h"

View File

@ -19,6 +19,7 @@
#include <asm/cpu.h> #include <asm/cpu.h>
#include <asm/irq_remapping.h> #include <asm/irq_remapping.h>
#include <asm/pci-direct.h> #include <asm/pci-direct.h>
#include <asm/posted_intr.h>
#include "iommu.h" #include "iommu.h"
#include "../irq_remapping.h" #include "../irq_remapping.h"
@ -49,6 +50,7 @@ struct irq_2_iommu {
u16 sub_handle; u16 sub_handle;
u8 irte_mask; u8 irte_mask;
enum irq_mode mode; enum irq_mode mode;
bool posted_msi;
}; };
struct intel_ir_data { struct intel_ir_data {
@ -1118,6 +1120,14 @@ static void prepare_irte(struct irte *irte, int vector, unsigned int dest)
irte->redir_hint = 1; irte->redir_hint = 1;
} }
/*
 * Initialize an IRTE for posted-interrupt delivery: zero the whole entry
 * (memset also clears padding, which matters for a hardware table entry),
 * then mark it present and posted.  The PID address and vector are filled
 * in later by intel_ir_reconfigure_irte_posted().
 */
static void prepare_irte_posted(struct irte *irte)
{
memset(irte, 0, sizeof(*irte));
irte->present = 1;
irte->p_pst = 1;
}
struct irq_remap_ops intel_irq_remap_ops = { struct irq_remap_ops intel_irq_remap_ops = {
.prepare = intel_prepare_irq_remapping, .prepare = intel_prepare_irq_remapping,
.enable = intel_enable_irq_remapping, .enable = intel_enable_irq_remapping,
@ -1126,6 +1136,47 @@ struct irq_remap_ops intel_irq_remap_ops = {
.enable_faulting = enable_drhd_fault_handling, .enable_faulting = enable_drhd_fault_handling,
}; };
#ifdef CONFIG_X86_POSTED_MSI
/*
 * Return the physical address of the posted-MSI descriptor of the first
 * CPU in the IRQ's effective affinity mask, or 0 on error.
 */
static phys_addr_t get_pi_desc_addr(struct irq_data *irqd)
{
int cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd));
if (WARN_ON(cpu >= nr_cpu_ids))
return 0;
return __pa(per_cpu_ptr(&posted_msi_pi_desc, cpu));
}
/*
 * Rewrite the IRTE so the IOMMU posts this MSI into the target CPU's
 * per-CPU PID instead of delivering it directly.
 */
static void intel_ir_reconfigure_irte_posted(struct irq_data *irqd)
{
struct intel_ir_data *ir_data = irqd->chip_data;
struct irte *irte = &ir_data->irte_entry;
struct irte irte_pi;
u64 pid_addr;
pid_addr = get_pi_desc_addr(irqd);
if (!pid_addr) {
pr_warn("Failed to setup IRQ %d for posted mode", irqd->irq);
return;
}
memset(&irte_pi, 0, sizeof(irte_pi));
/* The shared IRTE fields have already been set up as posted during alloc_irte */
dmar_copy_shared_irte(&irte_pi, irte);
/* Split the PID physical address into the low/high IRTE fields */
irte_pi.pda_l = (pid_addr >> (32 - PDA_LOW_BIT)) & ~(-1UL << PDA_LOW_BIT);
irte_pi.pda_h = (pid_addr >> 32) & ~(-1UL << PDA_HIGH_BIT);
modify_irte(&ir_data->irq_2_iommu, &irte_pi);
}
#else
static inline void intel_ir_reconfigure_irte_posted(struct irq_data *irqd) {}
#endif
static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force) static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force)
{ {
struct intel_ir_data *ir_data = irqd->chip_data; struct intel_ir_data *ir_data = irqd->chip_data;
@ -1139,8 +1190,9 @@ static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force)
irte->vector = cfg->vector; irte->vector = cfg->vector;
irte->dest_id = IRTE_DEST(cfg->dest_apicid); irte->dest_id = IRTE_DEST(cfg->dest_apicid);
/* Update the hardware only if the interrupt is in remapped mode. */ if (ir_data->irq_2_iommu.posted_msi)
if (force || ir_data->irq_2_iommu.mode == IRQ_REMAPPING) intel_ir_reconfigure_irte_posted(irqd);
else if (force || ir_data->irq_2_iommu.mode == IRQ_REMAPPING)
modify_irte(&ir_data->irq_2_iommu, irte); modify_irte(&ir_data->irq_2_iommu, irte);
} }
@ -1194,7 +1246,7 @@ static int intel_ir_set_vcpu_affinity(struct irq_data *data, void *info)
struct intel_ir_data *ir_data = data->chip_data; struct intel_ir_data *ir_data = data->chip_data;
struct vcpu_data *vcpu_pi_info = info; struct vcpu_data *vcpu_pi_info = info;
/* stop posting interrupts, back to remapping mode */ /* stop posting interrupts, back to the default mode */
if (!vcpu_pi_info) { if (!vcpu_pi_info) {
modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry); modify_irte(&ir_data->irq_2_iommu, &ir_data->irte_entry);
} else { } else {
@ -1233,6 +1285,49 @@ static struct irq_chip intel_ir_chip = {
.irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity, .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity,
}; };
/*
 * With posted MSIs, all vectors are multiplexed into a single notification
 * vector. Devices MSIs are then dispatched in a demux loop where
 * EOIs can be coalesced as well.
 *
 * "INTEL-IR-POST" IRQ chip does not do EOI on ACK, thus the dummy irq_ack()
 * function. Instead EOI is performed by the posted interrupt notification
 * handler.
 *
 * For the example below, 3 MSIs are coalesced into one CPU notification. Only
 * one apic_eoi() is needed.
 *
 * __sysvec_posted_msi_notification()
 *	irq_enter();
 *		handle_edge_irq()
 *			irq_chip_ack_parent()
 *				dummy(); // No EOI
 *			handle_irq_event()
 *				driver_handler()
 *		handle_edge_irq()
 *			irq_chip_ack_parent()
 *				dummy(); // No EOI
 *			handle_irq_event()
 *				driver_handler()
 *		handle_edge_irq()
 *			irq_chip_ack_parent()
 *				dummy(); // No EOI
 *			handle_irq_event()
 *				driver_handler()
 *	apic_eoi()
 *	irq_exit()
 */
/* Intentionally empty: EOI happens once in the notification handler. */
static void dummy_ack(struct irq_data *d) { }

/* irq_chip used for remapped MSIs delivered via posted interrupts. */
static struct irq_chip intel_ir_chip_post_msi = {
.name = "INTEL-IR-POST",
.irq_ack = dummy_ack,
.irq_set_affinity = intel_ir_set_affinity,
.irq_compose_msi_msg = intel_ir_compose_msi_msg,
.irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity,
};
static void fill_msi_msg(struct msi_msg *msg, u32 index, u32 subhandle) static void fill_msi_msg(struct msi_msg *msg, u32 index, u32 subhandle)
{ {
memset(msg, 0, sizeof(*msg)); memset(msg, 0, sizeof(*msg));
@ -1274,6 +1369,11 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
break; break;
case X86_IRQ_ALLOC_TYPE_PCI_MSI: case X86_IRQ_ALLOC_TYPE_PCI_MSI:
case X86_IRQ_ALLOC_TYPE_PCI_MSIX: case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
if (posted_msi_supported()) {
prepare_irte_posted(irte);
data->irq_2_iommu.posted_msi = 1;
}
set_msi_sid(irte, set_msi_sid(irte,
pci_real_dma_dev(msi_desc_to_pci_dev(info->desc))); pci_real_dma_dev(msi_desc_to_pci_dev(info->desc)));
break; break;
@ -1361,7 +1461,12 @@ static int intel_irq_remapping_alloc(struct irq_domain *domain,
irq_data->hwirq = (index << 16) + i; irq_data->hwirq = (index << 16) + i;
irq_data->chip_data = ird; irq_data->chip_data = ird;
irq_data->chip = &intel_ir_chip; if (posted_msi_supported() &&
((info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI) ||
(info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX)))
irq_data->chip = &intel_ir_chip_post_msi;
else
irq_data->chip = &intel_ir_chip;
intel_irq_remapping_prepare_irte(ird, irq_cfg, info, index, i); intel_irq_remapping_prepare_irte(ird, irq_cfg, info, index, i);
irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT); irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
} }

View File

@ -24,6 +24,8 @@ int no_x2apic_optout;
int disable_irq_post = 0; int disable_irq_post = 0;
bool enable_posted_msi __ro_after_init;
static int disable_irq_remap; static int disable_irq_remap;
static struct irq_remap_ops *remap_ops; static struct irq_remap_ops *remap_ops;
@ -70,7 +72,8 @@ static __init int setup_irqremap(char *str)
no_x2apic_optout = 1; no_x2apic_optout = 1;
else if (!strncmp(str, "nopost", 6)) else if (!strncmp(str, "nopost", 6))
disable_irq_post = 1; disable_irq_post = 1;
else if (IS_ENABLED(CONFIG_X86_POSTED_MSI) && !strncmp(str, "posted_msi", 10))
enable_posted_msi = true;
str += strcspn(str, ","); str += strcspn(str, ",");
while (*str == ',') while (*str == ',')
str++; str++;