From 11249e73992981e31fd50e7231da24fad68e3320 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 5 Feb 2015 12:39:36 +0100 Subject: [PATCH 1/2] sb_edac: Fix detection on SNB machines d0585cd815fa ("sb_edac: Claim a different PCI device") changed the probing of sb_edac to look for PCI device 0x3ca0: 3f:0e.0 System peripheral: Intel Corporation Xeon E5/Core i7 Processor Home Agent (rev 07) 00: 86 80 a0 3c 00 00 00 00 07 00 80 08 00 00 80 00 ... but we're matching for 0x3ca8, i.e. PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TA in sbridge_probe() therefore the probing fails. Changing it to probe for 0x3ca0 (PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_HA0), .i.e., the 14.0 device, fixes the issue and driver loads successfully again: [ 2449.013120] EDAC DEBUG: sbridge_init: [ 2449.017029] EDAC sbridge: Seeking for: PCI ID 8086:3ca0 [ 2449.022368] EDAC DEBUG: sbridge_get_onedevice: Detected 8086:3ca0 [ 2449.028498] EDAC sbridge: Seeking for: PCI ID 8086:3ca0 [ 2449.033768] EDAC sbridge: Seeking for: PCI ID 8086:3ca8 [ 2449.039028] EDAC DEBUG: sbridge_get_onedevice: Detected 8086:3ca8 [ 2449.045155] EDAC sbridge: Seeking for: PCI ID 8086:3ca8 ... Add a debug printk while at it to be able to catch the failure in the future and dump driver version on successful load. Fixes: d0585cd815fa ("sb_edac: Claim a different PCI device") Cc: stable@vger.kernel.org # 3.18 Acked-by: Aristeu Rozanski Cc: Tony Luck Acked-by: Andy Lutomirski Acked-by: Mauro Carvalho Chehab Signed-off-by: Borislav Petkov --- drivers/edac/sb_edac.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 63aa6730e89e..1acf57ba4c86 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -2447,7 +2447,7 @@ static int sbridge_probe(struct pci_dev *pdev, const struct pci_device_id *id) rc = sbridge_get_all_devices(&num_mc, pci_dev_descr_ibridge_table); type = IVY_BRIDGE; break; - case PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_TA: + case PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_HA0: rc = sbridge_get_all_devices(&num_mc, pci_dev_descr_sbridge_table); type = SANDY_BRIDGE; break; @@ -2460,8 +2460,11 @@ static int sbridge_probe(struct pci_dev *pdev, const struct pci_device_id *id) type = BROADWELL; break; } - if (unlikely(rc < 0)) + if (unlikely(rc < 0)) { + edac_dbg(0, "couldn't get all devices for 0x%x\n", pdev->device); goto fail0; + } + mc = 0; list_for_each_entry(sbridge_dev, &sbridge_edac_list, list) { @@ -2474,7 +2477,7 @@ static int sbridge_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto fail1; } - sbridge_printk(KERN_INFO, "Driver loaded.\n"); + sbridge_printk(KERN_INFO, "%s\n", SBRIDGE_REVISION); mutex_unlock(&sbridge_edac_lock); return 0; From 0c510cc83bdbaac8406f4f7caef34f4da0ba35ea Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Tue, 17 Feb 2015 11:34:38 +0800 Subject: [PATCH 2/2] EDAC, amd64_edac: Prevent OOPS with >16 memory controllers When DRAM errors occur on memory controllers after EDAC_MAX_MCS (16), the kernel fatally dereferences unallocated structures, see splat below; this occurs on at least NumaConnect systems. Fix by checking if a memory controller info structure was found. BUG: unable to handle kernel NULL pointer dereference at 0000000000000320 IP: [] decode_bus_error+0x2f/0x2b0 PGD 2f8b5a3067 PUD 2f8b5a2067 PMD 0 Oops: 0000 [#2] SMP Modules linked in: CPU: 224 PID: 11930 Comm: stream_c.exe.gn Tainted: G D 3.19.0 #1 Hardware name: Supermicro H8QGL/H8QGL, BIOS 3.5b 01/28/2015 task: ffff8807dbfb8c00 ti: ffff8807dd16c000 task.ti: ffff8807dd16c000 RIP: 0010:[] [] decode_bus_error+0x2f/0x2b0 RSP: 0000:ffff8907dfc03c48 EFLAGS: 00010297 RAX: 0000000000000001 RBX: 9c67400010080a13 RCX: 0000000000001dc6 RDX: 000000001dc61dc6 RSI: ffff8907dfc03df0 RDI: 000000000000001c RBP: ffff8907dfc03ce8 R08: 0000000000000000 R09: 0000000000000022 R10: ffff891fffa30380 R11: 00000000001cfc90 R12: 0000000000000008 R13: 0000000000000000 R14: 000000000000001c R15: 00009c6740001000 FS: 00007fa97ee18700(0000) GS:ffff8907dfc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000320 CR3: 0000003f889b8000 CR4: 00000000000407e0 Stack: 0000000000000000 ffff8907dfc03df0 0000000000000008 9c67400010080a13 000000000000001c 00009c6740001000 ffff8907dfc03c88 ffffffff810e4f9a ffff8907dfc03ce8 ffffffff81b375b9 0000000000000000 0000000000000010 Call Trace: ? vprintk_default ? printk amd_decode_mce notifier_call_chain atomic_notifier_call_chain mce_log machine_check_poll mce_timer_fn ? mce_cpu_restart call_timer_fn.isra.29 run_timer_softirq __do_softirq irq_exit smp_apic_timer_interrupt apic_timer_interrupt ? down_read_trylock __do_page_fault ? __schedule do_page_fault page_fault Signed-off-by: Daniel J Blueman Link: http://lkml.kernel.org/r/1424144078-24589-1-git-send-email-daniel@numascale.com Cc: stable@vger.kernel.org [ Boris: massage commit message ] Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 17638d7cf5c2..5907c1718f8c 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2174,14 +2174,20 @@ static void __log_bus_error(struct mem_ctl_info *mci, struct err_info *err, static inline void decode_bus_error(int node_id, struct mce *m) { - struct mem_ctl_info *mci = mcis[node_id]; - struct amd64_pvt *pvt = mci->pvt_info; + struct mem_ctl_info *mci; + struct amd64_pvt *pvt; u8 ecc_type = (m->status >> 45) & 0x3; u8 xec = XEC(m->status, 0x1f); u16 ec = EC(m->status); u64 sys_addr; struct err_info err; + mci = edac_mc_find(node_id); + if (!mci) + return; + + pvt = mci->pvt_info; + /* Bail out early if this was an 'observed' error */ if (PP(ec) == NBSL_PP_OBS) return;