From bd87f1f028ddaad45d4a9a3621dfe688c840ba41 Mon Sep 17 00:00:00 2001 From: Barry Kasindorf Date: Tue, 18 Dec 2007 18:05:58 +0100 Subject: [PATCH 1/9] oprofile: op_model_athlon.c support for AMD family 10h barcelona performance counters This patch is for controlling the upper 32bits of the event ctrl msrs. This includes the upper 4 bits of the event select and the Guest Only and Host Only bits This patch is necessary to make Event Based Profiling work reliably on a Family 10h processor [akpm@linux-foundation.org: checkpatch.pl fixes] Signed-off-by: Barry Kasindorf Signed-off-by: Robert Richter Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/oprofile/op_model_athlon.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index 3057a19e4641..c3ee43333f26 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -1,6 +1,6 @@ /** * @file op_model_athlon.h - * athlon / K7 model-specific MSR operations + * athlon / K7 / K8 / Family 10h model-specific MSR operations * * @remark Copyright 2002 OProfile authors * @remark Read the file COPYING @@ -31,12 +31,16 @@ #define CTRL_WRITE(l,h,msrs,c) do {wrmsr(msrs->controls[(c)].addr, (l), (h));} while (0) #define CTRL_SET_ACTIVE(n) (n |= (1<<22)) #define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) -#define CTRL_CLEAR(x) (x &= (1<<21)) +#define CTRL_CLEAR_LO(x) (x &= (1<<21)) +#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0) #define CTRL_SET_ENABLE(val) (val |= 1<<20) #define CTRL_SET_USR(val,u) (val |= ((u & 1) << 16)) #define CTRL_SET_KERN(val,k) (val |= ((k & 1) << 17)) #define CTRL_SET_UM(val, m) (val |= (m << 8)) -#define CTRL_SET_EVENT(val, e) (val |= e) +#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) +#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf)) +#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) +#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) static unsigned long reset_value[NUM_COUNTERS]; @@ -70,7 +74,8 @@ static void athlon_setup_ctrs(struct op_msrs const * const msrs) if (unlikely(!CTRL_IS_RESERVED(msrs,i))) continue; CTRL_READ(low, high, msrs, i); - CTRL_CLEAR(low); + CTRL_CLEAR_LO(low); + CTRL_CLEAR_HI(high); CTRL_WRITE(low, high, msrs, i); } @@ -89,12 +94,17 @@ static void athlon_setup_ctrs(struct op_msrs const * const msrs) CTR_WRITE(counter_config[i].count, msrs, i); CTRL_READ(low, high, msrs, i); - CTRL_CLEAR(low); + CTRL_CLEAR_LO(low); + CTRL_CLEAR_HI(high); CTRL_SET_ENABLE(low); CTRL_SET_USR(low, counter_config[i].user); CTRL_SET_KERN(low, counter_config[i].kernel); CTRL_SET_UM(low, counter_config[i].unit_mask); - CTRL_SET_EVENT(low, counter_config[i].event); + CTRL_SET_EVENT_LOW(low, counter_config[i].event); + CTRL_SET_EVENT_HIGH(high, counter_config[i].event); + CTRL_SET_HOST_ONLY(high, 0); + CTRL_SET_GUEST_ONLY(high, 0); + CTRL_WRITE(low, high, msrs, i); } else { reset_value[i] = 0; From cdc6f27d9e3c2f7ca1a3e19c6eabb1ad6a2add5d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 18 Dec 2007 18:05:58 +0100 Subject: [PATCH 2/9] clockevents: fix reprogramming decision in oneshot broadcast Resolve the following regression of a choppy, almost unusable laptop: http://lkml.org/lkml/2007/12/7/299 http://bugzilla.kernel.org/show_bug.cgi?id=9525 A previous version of the code did the reprogramming of the broadcast device in the return from idle code. This was removed, but the logic in tick_handle_oneshot_broadcast() was kept the same. When a broadcast interrupt happens we signal the expiry to all CPUs which have an expired event. If none of the CPUs has an expired event, which can happen in dyntick mode, then we reprogram the broadcast device. We do not reprogram otherwise, but this is only correct if all CPUs, which are in the idle broadcast state have been woken up. The code ignores, that there might be pending not yet expired events on other CPUs, which are in the idle broadcast state. So the delivery of those events can be delayed for quite a time. Change the tick_handle_oneshot_broadcast() function to check for CPUs, which are in broadcast state and are not woken up by the current event, and enforce the rearming of the broadcast device for those CPUs. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/time/tick-broadcast.c | 56 ++++++++++++++---------------------- 1 file changed, 21 insertions(+), 35 deletions(-) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index aa82d7bf478a..5b86698faa0b 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -383,33 +383,6 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) return 0; } -/* - * Reprogram the broadcast device: - * - * Called with tick_broadcast_lock held and interrupts disabled. - */ -static int tick_broadcast_reprogram(void) -{ - ktime_t expires = { .tv64 = KTIME_MAX }; - struct tick_device *td; - int cpu; - - /* - * Find the event which expires next: - */ - for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS; - cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) { - td = &per_cpu(tick_cpu_device, cpu); - if (td->evtdev->next_event.tv64 < expires.tv64) - expires = td->evtdev->next_event; - } - - if (expires.tv64 == KTIME_MAX) - return 0; - - return tick_broadcast_set_event(expires, 0); -} - /* * Handle oneshot mode broadcasting */ @@ -417,12 +390,13 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) { struct tick_device *td; cpumask_t mask; - ktime_t now; + ktime_t now, next_event; int cpu; spin_lock(&tick_broadcast_lock); again: dev->next_event.tv64 = KTIME_MAX; + next_event.tv64 = KTIME_MAX; mask = CPU_MASK_NONE; now = ktime_get(); /* Find all expired events */ @@ -431,19 +405,31 @@ again: td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev->next_event.tv64 <= now.tv64) cpu_set(cpu, mask); + else if (td->evtdev->next_event.tv64 < next_event.tv64) + next_event.tv64 = td->evtdev->next_event.tv64; } /* - * Wakeup the cpus which have an expired event. The broadcast - * device is reprogrammed in the return from idle code. + * Wakeup the cpus which have an expired event. */ - if (!tick_do_broadcast(mask)) { + tick_do_broadcast(mask); + + /* + * Two reasons for reprogram: + * + * - The global event did not expire any CPU local + * events. This happens in dyntick mode, as the maximum PIT + * delta is quite small. + * + * - There are pending events on sleeping CPUs which were not + * in the event mask + */ + if (next_event.tv64 != KTIME_MAX) { /* - * The global event did not expire any CPU local - * events. This happens in dyntick mode, as the - * maximum PIT delta is quite small. + * Rearm the broadcast device. If event expired, + * repeat the above */ - if (tick_broadcast_reprogram()) + if (tick_broadcast_set_event(next_event, 0)) goto again; } spin_unlock(&tick_broadcast_lock); From b019e57321f3e006c0ec7a54f13efc377bcb6451 Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Tue, 18 Dec 2007 18:05:58 +0100 Subject: [PATCH 3/9] genirq: add unlocked version of set_irq_handler() Add unlocked version for use by irq_chip.set_type handlers which may wish to change handler to level or edge handler when IRQ type is changed. The normal set_irq_handler() call cannot be used because it tries to take irq_desc.lock which is already held when the irq_chip.set_type hook is called. Signed-off-by: Kevin Hilman Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/irq.h b/include/linux/irq.h index efc88538b2ba..4669be080617 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -339,6 +339,13 @@ extern void __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name); +/* caller has locked the irq_desc and both params are valid */ +static inline void __set_irq_handler_unlocked(int irq, + irq_flow_handler_t handler) +{ + irq_desc[irq].handle_irq = handler; +} + /* * Set a highlevel flow handler for a given IRQ: */ From b4be625852618636a6b54908c4f9d90fb29dc549 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 18 Dec 2007 18:05:58 +0100 Subject: [PATCH 4/9] timer: kernel/timer.c section fixes This patch fixes the following section mismatches with CONFIG_HOTPLUG=n, CONFIG_HOTPLUG_CPU=y: ... WARNING: vmlinux.o(.text+0x41cd3): Section mismatch: reference to .init.data:tvec_base_done.22610 (between 'timer_cpu_notify' and 'run_timer_softirq') WARNING: vmlinux.o(.text+0x41d67): Section mismatch: reference to .init.data:tvec_base_done.22610 (between 'timer_cpu_notify' and 'run_timer_softirq') ... Signed-off-by: Adrian Bunk Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/timer.c b/kernel/timer.c index a05817c021d6..d4527dcef1af 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1219,11 +1219,11 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) */ static struct lock_class_key base_lock_keys[NR_CPUS]; -static int __devinit init_timers_cpu(int cpu) +static int __cpuinit init_timers_cpu(int cpu) { int j; tvec_base_t *base; - static char __devinitdata tvec_base_done[NR_CPUS]; + static char __cpuinitdata tvec_base_done[NR_CPUS]; if (!tvec_base_done[cpu]) { static char boot_done; From 29b6cd794e73eea7600541d06288a09861ffecb0 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 18 Dec 2007 18:05:58 +0100 Subject: [PATCH 5/9] x86: jprobe bugfix jprobe for x86-64 may cause kernel page fault when the jprobe_return() is called from incorrect function. - Use jprobe_saved_regs instead getting it from stack. (Especially on x86-64, it may get incorrect data, because pt_regs can not be get by using container_of(rsp)) - Change the type of stack pointer to unsigned long *. Signed-off-by: Masami Hiramatsu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/kprobes_32.c | 4 +--- arch/x86/kernel/kprobes_64.c | 6 ++---- include/asm-x86/kprobes_32.h | 2 +- include/asm-x86/kprobes_64.h | 2 +- 4 files changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/kprobes_32.c b/arch/x86/kernel/kprobes_32.c index d87a523070d1..3a020f79f82b 100644 --- a/arch/x86/kernel/kprobes_32.c +++ b/arch/x86/kernel/kprobes_32.c @@ -727,9 +727,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { if (®s->esp != kcb->jprobe_saved_esp) { - struct pt_regs *saved_regs = - container_of(kcb->jprobe_saved_esp, - struct pt_regs, esp); + struct pt_regs *saved_regs = &kcb->jprobe_saved_regs; printk("current esp %p does not match saved esp %p\n", ®s->esp, kcb->jprobe_saved_esp); printk("Saved registers for jprobe %p\n", jp); diff --git a/arch/x86/kernel/kprobes_64.c b/arch/x86/kernel/kprobes_64.c index 0c467644589c..a575059fb420 100644 --- a/arch/x86/kernel/kprobes_64.c +++ b/arch/x86/kernel/kprobes_64.c @@ -716,10 +716,8 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) struct jprobe *jp = container_of(p, struct jprobe, kp); if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { - if ((long *)regs->rsp != kcb->jprobe_saved_rsp) { - struct pt_regs *saved_regs = - container_of(kcb->jprobe_saved_rsp, - struct pt_regs, rsp); + if ((unsigned long *)regs->rsp != kcb->jprobe_saved_rsp) { + struct pt_regs *saved_regs = &kcb->jprobe_saved_regs; printk("current rsp %p does not match saved rsp %p\n", (long *)regs->rsp, kcb->jprobe_saved_rsp); printk("Saved registers for jprobe %p\n", jp); diff --git a/include/asm-x86/kprobes_32.h b/include/asm-x86/kprobes_32.h index b772d5b38685..9fe8f3bddfd5 100644 --- a/include/asm-x86/kprobes_32.h +++ b/include/asm-x86/kprobes_32.h @@ -73,7 +73,7 @@ struct kprobe_ctlblk { unsigned long kprobe_status; unsigned long kprobe_old_eflags; unsigned long kprobe_saved_eflags; - long *jprobe_saved_esp; + unsigned long *jprobe_saved_esp; struct pt_regs jprobe_saved_regs; kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE]; struct prev_kprobe prev_kprobe; diff --git a/include/asm-x86/kprobes_64.h b/include/asm-x86/kprobes_64.h index 53f4d8507354..743d76218fc9 100644 --- a/include/asm-x86/kprobes_64.h +++ b/include/asm-x86/kprobes_64.h @@ -66,7 +66,7 @@ struct kprobe_ctlblk { unsigned long kprobe_status; unsigned long kprobe_old_rflags; unsigned long kprobe_saved_rflags; - long *jprobe_saved_rsp; + unsigned long *jprobe_saved_rsp; struct pt_regs jprobe_saved_regs; kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE]; struct prev_kprobe prev_kprobe; From 0b0122faf4833548072d23f3c3063c23bc289746 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 18 Dec 2007 18:05:58 +0100 Subject: [PATCH 6/9] x86: kprobes bugfix Kprobes for x86-64 may cause a kernel crash if it inserted on "iret" instruction. "call absolute" is invalid on x86-64, so we don't need treat it. - Change the processing order as same as x86-32. - Add "iret"(0xcf) case. - Remove next_rip local variable. Signed-off-by: Masami Hiramatsu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/kprobes_64.c | 41 ++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/arch/x86/kernel/kprobes_64.c b/arch/x86/kernel/kprobes_64.c index a575059fb420..5df19a9f9239 100644 --- a/arch/x86/kernel/kprobes_64.c +++ b/arch/x86/kernel/kprobes_64.c @@ -485,7 +485,6 @@ static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { unsigned long *tos = (unsigned long *)regs->rsp; - unsigned long next_rip = 0; unsigned long copy_rip = (unsigned long)p->ainsn.insn; unsigned long orig_rip = (unsigned long)p->addr; kprobe_opcode_t *insn = p->ainsn.insn; @@ -494,46 +493,42 @@ static void __kprobes resume_execution(struct kprobe *p, if (*insn >= 0x40 && *insn <= 0x4f) insn++; + regs->eflags &= ~TF_MASK; switch (*insn) { - case 0x9c: /* pushfl */ + case 0x9c: /* pushfl */ *tos &= ~(TF_MASK | IF_MASK); *tos |= kcb->kprobe_old_rflags; break; - case 0xc3: /* ret/lret */ - case 0xcb: - case 0xc2: + case 0xc2: /* iret/ret/lret */ + case 0xc3: case 0xca: - regs->eflags &= ~TF_MASK; - /* rip is already adjusted, no more changes required*/ - return; - case 0xe8: /* call relative - Fix return addr */ + case 0xcb: + case 0xcf: + case 0xea: /* jmp absolute -- ip is correct */ + /* ip is already adjusted, no more changes required */ + goto no_change; + case 0xe8: /* call relative - Fix return addr */ *tos = orig_rip + (*tos - copy_rip); break; case 0xff: if ((insn[1] & 0x30) == 0x10) { /* call absolute, indirect */ - /* Fix return addr; rip is correct. */ - next_rip = regs->rip; + /* Fix return addr; ip is correct. */ *tos = orig_rip + (*tos - copy_rip); + goto no_change; } else if (((insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */ ((insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */ - /* rip is correct. */ - next_rip = regs->rip; + /* ip is correct. */ + goto no_change; } - break; - case 0xea: /* jmp absolute -- rip is correct */ - next_rip = regs->rip; - break; default: break; } - regs->eflags &= ~TF_MASK; - if (next_rip) { - regs->rip = next_rip; - } else { - regs->rip = orig_rip + (regs->rip - copy_rip); - } + regs->rip = orig_rip + (regs->rip - copy_rip); +no_change: + + return; } int __kprobes post_kprobe_handler(struct pt_regs *regs) From 213fde71024223abcdd7d9e5349d1ea2679227b5 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 18 Dec 2007 18:05:58 +0100 Subject: [PATCH 7/9] x86: also define AT_VECTOR_SIZE_ARCH The patch introducing this left out 64-bit x86 despite it also having extra entries. this solves Xen guest troubles. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-x86/system_64.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/asm-x86/system_64.h b/include/asm-x86/system_64.h index 4cb23848d460..6e9e4841a2da 100644 --- a/include/asm-x86/system_64.h +++ b/include/asm-x86/system_64.h @@ -7,6 +7,13 @@ #ifdef __KERNEL__ +/* entries in ARCH_DLINFO: */ +#ifdef CONFIG_IA32_EMULATION +# define AT_VECTOR_SIZE_ARCH 2 +#else +# define AT_VECTOR_SIZE_ARCH 1 +#endif + #define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t" #define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t" From 971e5b35fb02c5088d49e6c024aab73582a35b71 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 18 Dec 2007 18:05:58 +0100 Subject: [PATCH 8/9] genirq: revert lazy irq disable for simple irqs In commit 76d2160147f43f982dfe881404cfde9fd0a9da21 lazy irq disabling was implemented, and the simple irq handler had a masking set to it. Remy Bohmer discovered that some devices in the ARM architecture would trigger the mask, but never unmask it. His patch to do the unmasking was questioned by Russell King about masking simple irqs to begin with. Looking further, it was discovered that the problems Remy was seeing was due to improper use of the simple handler by devices, and he later submitted patches to fix those. But the issue that was uncovered was that the simple handler should never mask. This patch reverts the masking in the simple handler. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Acked-by: Russell King --- kernel/irq/chip.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 9b5dff6b3f6a..44019ce30a14 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -297,18 +297,13 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); kstat_cpu(cpu).irqs[irq]++; action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) { - if (desc->chip->mask) - desc->chip->mask(irq); - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - desc->status |= IRQ_PENDING; + if (unlikely(!action || (desc->status & IRQ_DISABLED))) goto out_unlock; - } - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING | IRQ_PENDING); desc->status |= IRQ_INPROGRESS; spin_unlock(&desc->lock); From 4aae07025265151e3f7041dfbf0f529e122de1d8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 18 Dec 2007 18:05:58 +0100 Subject: [PATCH 9/9] x86: fix "Kernel panic - not syncing: IO-APIC + timer doesn't work!" this is the tale of a full day spent debugging an ancient but elusive bug. after booting up thousands of random .config kernels, i finally happened to generate a .config that produced the following rare bootup failure on 32-bit x86: | ..TIMER: vector=0x31 apic1=0 pin1=2 apic2=-1 pin2=-1 | ..MP-BIOS bug: 8254 timer not connected to IO-APIC | ...trying to set up timer (IRQ0) through the 8259A ... failed. | ...trying to set up timer as Virtual Wire IRQ... failed. | ...trying to set up timer as ExtINT IRQ... failed :(. | Kernel panic - not syncing: IO-APIC + timer doesn't work! Boot with apic=debug | and send a report. Then try booting with the 'noapic' option this bug has been reported many times during the years, but it was never reproduced nor fixed. the bug that i hit was extremely sensitive to .config details. First i did a .config-bisection - suspecting some .config detail. That led to CONFIG_X86_MCE: enabling X86_MCE magically made the bug disappear and the system would boot up just fine. Debugging my way through the MCE code ended up identifying two unlikely candidates: the thing that made a real difference to the hang was that X86_MCE did two printks: Intel machine check architecture supported. Intel machine check reporting enabled on CPU#1. Adding the same printks to a !CONFIG_X86_MCE kernel made the bug go away! this left timing as the main suspect: i experimented with adding various udelay()s to the arch/x86/kernel/io_apic_32.c:check_timer() function, and the race window turned out to be narrower than 30 microseconds (!). That made debugging especially funny, debugging without having printk ability before the bug hits is ... interesting ;-) eventually i started suspecting IRQ activities - those are pretty much the only thing that happen this early during bootup and have the timescale of a few dozen microseconds. Also, check_timer() changes the IRQ hardware in various creative ways, so the main candidate became IRQ0 interaction. i've added a counter to track timer irqs (on which core they arrived, at what exact time, etc.) and found that no timer IRQ would arrive after the bug condition hits - even if we re-enable IRQ0 and re-initialize the i8259A, but that we'd get a small number of timer irqs right around the time when we call the check_timer() function. Eventually i got the following backtrace triggered from debug code in the timer interrupt: ...trying to set up timer as Virtual Wire IRQ... failed. ...trying to set up timer as ExtINT IRQ... Pid: 1, comm: swapper Not tainted (2.6.24-rc5 #57) EIP: 0060:[] EFLAGS: 00000246 CPU: 0 EIP is at _spin_unlock_irqrestore+0x5/0x1c EAX: c0634178 EBX: 00000000 ECX: c4947d63 EDX: 00000246 ESI: 00000002 EDI: 00010031 EBP: c04e0f2e ESP: f7c41df4 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 8005003b CR2: ffe04000 CR3: 00630000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [] setup_IO_APIC+0x9c3/0xc5c the spin_unlock() was called from init_8259A(). Wait ... we have an IRQ0 entry while we are in the middle of setting up the local APIC, the i8259A and the PIT?? That is certainly not how it's supposed to work! check_timer() was supposed to be called with irqs turned off - but this eroded away sometime in the past. This code would still work most of the time because this code runs very quickly, but just the right timing conditions are present and IRQ0 hits in this small, ~30 usecs window, timer irqs stop and the system does not boot up. Also, given how early this is during bootup, the hang is very deterministic - but it would only occur on certain machines (and certain configs). The fix was quite simple: disable/restore interrupts properly in this function. With that in place the test-system now boots up just fine. (64-bit x86 io_apic_64.c had the same bug.) Phew! One down, only 1500 other kernel bugs are left ;-) Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/io_apic_32.c | 16 ++++++++++++---- arch/x86/kernel/io_apic_64.c | 16 ++++++++++++---- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 6cf27319a91c..c3a565bba106 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -1882,13 +1882,16 @@ __setup("no_timer_check", notimercheck); static int __init timer_irq_works(void) { unsigned long t1 = jiffies; + unsigned long flags; if (no_timer_check) return 1; + local_save_flags(flags); local_irq_enable(); /* Let ten ticks pass... */ mdelay((10 * 1000) / HZ); + local_irq_restore(flags); /* * Expect a few ticks at least, to be sure some possible @@ -2167,6 +2170,9 @@ static inline void __init check_timer(void) int apic1, pin1, apic2, pin2; int vector; unsigned int ver; + unsigned long flags; + + local_irq_save(flags); ver = apic_read(APIC_LVR); ver = GET_APIC_VERSION(ver); @@ -2219,7 +2225,7 @@ static inline void __init check_timer(void) } if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); - return; + goto out; } clear_IO_APIC_pin(apic1, pin1); printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to " @@ -2242,7 +2248,7 @@ static inline void __init check_timer(void) if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); } - return; + goto out; } /* * Cleanup, just in case ... @@ -2266,7 +2272,7 @@ static inline void __init check_timer(void) if (timer_irq_works()) { printk(" works.\n"); - return; + goto out; } apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); printk(" failed.\n"); @@ -2282,11 +2288,13 @@ static inline void __init check_timer(void) if (timer_irq_works()) { printk(" works.\n"); - return; + goto out; } printk(" failed :(.\n"); panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " "report. Then try booting with the 'noapic' option"); +out: + local_irq_restore(flags); } /* diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 435a8c9b55f8..cbac1670c7c3 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1281,10 +1281,13 @@ void disable_IO_APIC(void) static int __init timer_irq_works(void) { unsigned long t1 = jiffies; + unsigned long flags; + local_save_flags(flags); local_irq_enable(); /* Let ten ticks pass... */ mdelay((10 * 1000) / HZ); + local_irq_restore(flags); /* * Expect a few ticks at least, to be sure some possible @@ -1655,6 +1658,9 @@ static inline void check_timer(void) { struct irq_cfg *cfg = irq_cfg + 0; int apic1, pin1, apic2, pin2; + unsigned long flags; + + local_irq_save(flags); /* * get/set the timer IRQ vector: @@ -1696,7 +1702,7 @@ static inline void check_timer(void) } if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); - return; + goto out; } clear_IO_APIC_pin(apic1, pin1); apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not " @@ -1718,7 +1724,7 @@ static inline void check_timer(void) if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); } - return; + goto out; } /* * Cleanup, just in case ... @@ -1741,7 +1747,7 @@ static inline void check_timer(void) if (timer_irq_works()) { apic_printk(APIC_VERBOSE," works.\n"); - return; + goto out; } apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); apic_printk(APIC_VERBOSE," failed.\n"); @@ -1756,10 +1762,12 @@ static inline void check_timer(void) if (timer_irq_works()) { apic_printk(APIC_VERBOSE," works.\n"); - return; + goto out; } apic_printk(APIC_VERBOSE," failed :(.\n"); panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); +out: + local_irq_restore(flags); } static int __init notimercheck(char *s)