linux/arch/x86/include/asm/hw_irq.h
Thomas Gleixner 551adc6057 x86/irq: Cure live lock in fixup_irqs()
Harry reported, that he's able to trigger a system freeze with cpu hot
unplug. The freeze turned out to be a live lock caused by recent changes in
irq_force_complete_move().

When fixup_irqs() and from there irq_force_complete_move() is called on the
dying cpu, then all other cpus are in stop machine an wait for the dying cpu
to complete the teardown. If there is a move of an interrupt pending then
irq_force_complete_move() sends the cleanup IPI to the cpus in the old_domain
mask and waits for them to clear the mask. That's obviously impossible as
those cpus are firmly stuck in stop machine with interrupts disabled.

I should have known that, but I completely overlooked it being concentrated on
the locking issues around the vectors. And the existance of the call to
__irq_complete_move() in the code, which actually sends the cleanup IPI made
it reasonable to wait for that cleanup to complete. That call was bogus even
before the recent changes as it was just a pointless distraction.

We have to look at two cases:

1) The move_in_progress flag of the interrupt is set

   This means the ioapic has been updated with the new vector, but it has not
   fired yet. In theory there is a race:

   set_ioapic(new_vector) <-- Interrupt is raised before update is effective,
   			      i.e. it's raised on the old vector. 

   So if the target cpu cannot handle that interrupt before the old vector is
   cleaned up, we get a spurious interrupt and in the worst case the ioapic
   irq line becomes stale, but my experiments so far have only resulted in
   spurious interrupts.

   But in case of cpu hotplug this should be a non issue because if the
   affinity update happens right before all cpus rendevouz in stop machine,
   there is no way that the interrupt can be blocked on the target cpu because
   all cpus loops first with interrupts enabled in stop machine, so the old
   vector is not yet cleaned up when the interrupt fires.

   So the only way to run into this issue is if the delivery of the interrupt
   on the apic/system bus would be delayed beyond the point where the target
   cpu disables interrupts in stop machine. I doubt that it can happen, but at
   least there is a theroretical chance. Virtualization might be able to
   expose this, but AFAICT the IOAPIC emulation is not as stupid as the real
   hardware.

   I've spent quite some time over the weekend to enforce that situation,
   though I was not able to trigger the delayed case.

2) The move_in_progress flag is not set and the old_domain cpu mask is not
   empty.

   That means, that an interrupt was delivered after the change and the
   cleanup IPI has been sent to the cpus in old_domain, but not all CPUs have
   responded to it yet.

In both cases we can assume that the next interrupt will arrive on the new
vector, so we can cleanup the old vectors on the cpus in the old_domain cpu
mask.

Fixes: 98229aa36c "x86/irq: Plug vector cleanup race"
Reported-by: Harry Junior <harryjr@outlook.fr>
Tested-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Joe Lawrence <joe.lawrence@stratus.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1603140931430.3657@nanos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2016-03-18 14:51:06 +01:00

186 lines
4.6 KiB
C

#ifndef _ASM_X86_HW_IRQ_H
#define _ASM_X86_HW_IRQ_H
/*
* (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
*
* moved some of the old arch/i386/kernel/irq.h to here. VY
*
* IRQ/IPI changes taken from work by Thomas Radke
* <tomsoft@informatik.tu-chemnitz.de>
*
* hacked by Andi Kleen for x86-64.
* unified by tglx
*/
#include <asm/irq_vectors.h>
#ifndef __ASSEMBLY__
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/smp.h>
#include <linux/atomic.h>
#include <asm/irq.h>
#include <asm/sections.h>
/* Interrupt handlers registered during init_IRQ */
extern asmlinkage void apic_timer_interrupt(void);
extern asmlinkage void x86_platform_ipi(void);
extern asmlinkage void kvm_posted_intr_ipi(void);
extern asmlinkage void kvm_posted_intr_wakeup_ipi(void);
extern asmlinkage void error_interrupt(void);
extern asmlinkage void irq_work_interrupt(void);
extern asmlinkage void spurious_interrupt(void);
extern asmlinkage void thermal_interrupt(void);
extern asmlinkage void reschedule_interrupt(void);
extern asmlinkage void irq_move_cleanup_interrupt(void);
extern asmlinkage void reboot_interrupt(void);
extern asmlinkage void threshold_interrupt(void);
extern asmlinkage void deferred_error_interrupt(void);
extern asmlinkage void call_function_interrupt(void);
extern asmlinkage void call_function_single_interrupt(void);
#ifdef CONFIG_TRACING
/* Interrupt handlers registered during init_IRQ */
extern void trace_apic_timer_interrupt(void);
extern void trace_x86_platform_ipi(void);
extern void trace_error_interrupt(void);
extern void trace_irq_work_interrupt(void);
extern void trace_spurious_interrupt(void);
extern void trace_thermal_interrupt(void);
extern void trace_reschedule_interrupt(void);
extern void trace_threshold_interrupt(void);
extern void trace_deferred_error_interrupt(void);
extern void trace_call_function_interrupt(void);
extern void trace_call_function_single_interrupt(void);
#define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt
#define trace_reboot_interrupt reboot_interrupt
#define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi
#define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi
#endif /* CONFIG_TRACING */
#ifdef CONFIG_X86_LOCAL_APIC
struct irq_data;
struct pci_dev;
struct msi_desc;
enum irq_alloc_type {
X86_IRQ_ALLOC_TYPE_IOAPIC = 1,
X86_IRQ_ALLOC_TYPE_HPET,
X86_IRQ_ALLOC_TYPE_MSI,
X86_IRQ_ALLOC_TYPE_MSIX,
X86_IRQ_ALLOC_TYPE_DMAR,
X86_IRQ_ALLOC_TYPE_UV,
};
struct irq_alloc_info {
enum irq_alloc_type type;
u32 flags;
const struct cpumask *mask; /* CPU mask for vector allocation */
union {
int unused;
#ifdef CONFIG_HPET_TIMER
struct {
int hpet_id;
int hpet_index;
void *hpet_data;
};
#endif
#ifdef CONFIG_PCI_MSI
struct {
struct pci_dev *msi_dev;
irq_hw_number_t msi_hwirq;
};
#endif
#ifdef CONFIG_X86_IO_APIC
struct {
int ioapic_id;
int ioapic_pin;
int ioapic_node;
u32 ioapic_trigger : 1;
u32 ioapic_polarity : 1;
u32 ioapic_valid : 1;
struct IO_APIC_route_entry *ioapic_entry;
};
#endif
#ifdef CONFIG_DMAR_TABLE
struct {
int dmar_id;
void *dmar_data;
};
#endif
#ifdef CONFIG_HT_IRQ
struct {
int ht_pos;
int ht_idx;
struct pci_dev *ht_dev;
void *ht_update;
};
#endif
#ifdef CONFIG_X86_UV
struct {
int uv_limit;
int uv_blade;
unsigned long uv_offset;
char *uv_name;
};
#endif
#if IS_ENABLED(CONFIG_VMD)
struct {
struct msi_desc *desc;
};
#endif
};
};
struct irq_cfg {
unsigned int dest_apicid;
u8 vector;
u8 old_vector;
};
extern struct irq_cfg *irq_cfg(unsigned int irq);
extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data);
extern void lock_vector_lock(void);
extern void unlock_vector_lock(void);
extern void setup_vector_irq(int cpu);
#ifdef CONFIG_SMP
extern void send_cleanup_vector(struct irq_cfg *);
extern void irq_complete_move(struct irq_cfg *cfg);
#else
static inline void send_cleanup_vector(struct irq_cfg *c) { }
static inline void irq_complete_move(struct irq_cfg *c) { }
#endif
extern void apic_ack_edge(struct irq_data *data);
#else /* CONFIG_X86_LOCAL_APIC */
static inline void lock_vector_lock(void) {}
static inline void unlock_vector_lock(void) {}
#endif /* CONFIG_X86_LOCAL_APIC */
/* Statistics */
extern atomic_t irq_err_count;
extern atomic_t irq_mis_count;
extern void elcr_set_level_irq(unsigned int irq);
extern char irq_entries_start[];
#ifdef CONFIG_TRACING
#define trace_irq_entries_start irq_entries_start
#endif
#define VECTOR_UNUSED NULL
#define VECTOR_RETRIGGERED ((void *)~0UL)
typedef struct irq_desc* vector_irq_t[NR_VECTORS];
DECLARE_PER_CPU(vector_irq_t, vector_irq);
#endif /* !ASSEMBLY_ */
#endif /* _ASM_X86_HW_IRQ_H */