From 876bb331e26b970c2d8caea2c1d1209fdae953d0 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 10 Sep 2012 15:52:27 +0200 Subject: [PATCH 01/24] EDAC: Respect operational state in edac_pci.c Currently, we unconditionally enable PCI polling and we don't look at the edac_op_state module parameter. Make this dependent on the parameter setting supplied on the command line. Signed-off-by: Borislav Petkov --- drivers/edac/edac_pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/edac/edac_pci.c b/drivers/edac/edac_pci.c index ee87ef972ead..dd370f92ace3 100644 --- a/drivers/edac/edac_pci.c +++ b/drivers/edac/edac_pci.c @@ -470,7 +470,8 @@ struct edac_pci_ctl_info *edac_pci_create_generic_ctl(struct device *dev, pci->mod_name = mod_name; pci->ctl_name = EDAC_PCI_GENCTL_NAME; - pci->edac_check = edac_pci_generic_check; + if (edac_op_state == EDAC_OPSTATE_POLL) + pci->edac_check = edac_pci_generic_check; pdata->edac_idx = edac_pci_idx++; From 37929874d439d79e8f6128f400f63069ee1bbf3e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 10 Sep 2012 16:50:54 +0200 Subject: [PATCH 02/24] EDAC: Boundary-check edac_debug_level Only levels [0:4] are allowed so enforce that. Also, while at it, massage Kconfig text and add valid debug levels range to the module parameter description. Signed-off-by: Borislav Petkov --- drivers/edac/Kconfig | 8 ++++---- drivers/edac/edac_module.c | 27 ++++++++++++++++++++------- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 409b92b8d346..bb82d6be793c 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -42,10 +42,10 @@ config EDAC_LEGACY_SYSFS config EDAC_DEBUG bool "Debugging" help - This turns on debugging information for the entire EDAC - sub-system. You can insert module with "debug_level=x", current - there're four debug levels (x=0,1,2,3 from low to high). - Usually you should select 'N'. + This turns on debugging information for the entire EDAC subsystem. + You do so by inserting edac_module with "edac_debug_level=x." Valid + levels are 0-4 (from low to high) and by default it is set to 2. + Usually you should select 'N' here. config EDAC_DECODE_MCE tristate "Decode MCEs in human-readable form (only on AMD for now)" diff --git a/drivers/edac/edac_module.c b/drivers/edac/edac_module.c index 58a28d838f37..12c951a2c33d 100644 --- a/drivers/edac/edac_module.c +++ b/drivers/edac/edac_module.c @@ -18,9 +18,29 @@ #define EDAC_VERSION "Ver: 3.0.0" #ifdef CONFIG_EDAC_DEBUG + +static int edac_set_debug_level(const char *buf, struct kernel_param *kp) +{ + unsigned long val; + int ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + if (val < 0 || val > 4) + return -EINVAL; + + return param_set_int(buf, kp); +} + /* Values of 0 to 4 will generate output */ int edac_debug_level = 2; EXPORT_SYMBOL_GPL(edac_debug_level); + +module_param_call(edac_debug_level, edac_set_debug_level, param_get_int, + &edac_debug_level, 0644); +MODULE_PARM_DESC(edac_debug_level, "EDAC debug level: [0-4], default: 2"); #endif /* scope is to module level only */ @@ -132,10 +152,3 @@ module_exit(edac_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Doug Thompson www.softwarebitmaker.com, et al"); MODULE_DESCRIPTION("Core library routines for EDAC reporting"); - -/* refer to *_sysfs.c files for parameters that are exported via sysfs */ - -#ifdef CONFIG_EDAC_DEBUG -module_param(edac_debug_level, int, 0644); -MODULE_PARM_DESC(edac_debug_level, "Debug level"); -#endif From 4da1b7bfe7699881c761d71b5e299a65bce48ab2 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 10 Sep 2012 17:57:44 +0200 Subject: [PATCH 03/24] EDAC: Remove useless assignment of error type The tracepoint decodes the error type later anyway so remove a useless assignment to the temporary p which gets overwritten later anyway. Signed-off-by: Borislav Petkov --- drivers/edac/edac_mc.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 90f0b730e9bb..d5074222db69 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -1093,10 +1093,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, */ for (i = 0; i < mci->n_layers; i++) { if (pos[i] >= (int)mci->layers[i].size) { - if (type == HW_EVENT_ERR_CORRECTED) - p = "CE"; - else - p = "UE"; edac_mc_printk(mci, KERN_ERR, "INTERNAL ERROR: %s value is out of range (%d >= %d)\n", @@ -1128,6 +1124,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, grain = 0; p = label; *p = '\0'; + for (i = 0; i < mci->tot_dimms; i++) { struct dimm_info *dimm = mci->dimms[i]; @@ -1195,6 +1192,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, /* Fill the RAM location data */ p = location; + for (i = 0; i < mci->n_layers; i++) { if (pos[i] < 0) continue; @@ -1207,7 +1205,6 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type, *(p - 1) = '\0'; /* Report the error via the trace interface */ - grain_bits = fls_long(grain) + 1; trace_mc_event(type, msg, label, error_count, mci->mc_idx, top_layer, mid_layer, low_layer, From f430d5707aa47af8669bbc0083a79e7d780908b2 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 10 Sep 2012 18:36:09 +0200 Subject: [PATCH 04/24] EDAC: Handle empty msg strings when reporting errors A reported error could look like this [ 226.178315] EDAC MC0: 1 CE on mc#0csrow#0channel#0 (csrow:0 channel:0 page:0x427c0d offset:0xde0 grain:0 syndrome:0x1c6) with two spaces back-to-back due to the msg argument of edac_mc_handle_error being passed on empty by the specific drivers. Handle that. Signed-off-by: Borislav Petkov --- drivers/edac/edac_mc.c | 44 ++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index d5074222db69..39c75246c2ae 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -966,20 +966,22 @@ static void edac_ce_error(struct mem_ctl_info *mci, long grain) { unsigned long remapped_page; + char *msg_aux = ""; + + if (*msg) + msg_aux = " "; if (edac_mc_get_log_ce()) { if (other_detail && *other_detail) edac_mc_printk(mci, KERN_WARNING, - "%d CE %s on %s (%s %s - %s)\n", - error_count, - msg, label, location, - detail, other_detail); + "%d CE %s%son %s (%s %s - %s)\n", + error_count, msg, msg_aux, label, + location, detail, other_detail); else edac_mc_printk(mci, KERN_WARNING, - "%d CE %s on %s (%s %s)\n", - error_count, - msg, label, location, - detail); + "%d CE %s%son %s (%s %s)\n", + error_count, msg, msg_aux, label, + location, detail); } edac_inc_ce_error(mci, enable_per_layer_report, pos, error_count); @@ -1014,27 +1016,31 @@ static void edac_ue_error(struct mem_ctl_info *mci, const char *other_detail, const bool enable_per_layer_report) { + char *msg_aux = ""; + + if (*msg) + msg_aux = " "; + if (edac_mc_get_log_ue()) { if (other_detail && *other_detail) edac_mc_printk(mci, KERN_WARNING, - "%d UE %s on %s (%s %s - %s)\n", - error_count, - msg, label, location, detail, - other_detail); + "%d UE %s%son %s (%s %s - %s)\n", + error_count, msg, msg_aux, label, + location, detail, other_detail); else edac_mc_printk(mci, KERN_WARNING, - "%d UE %s on %s (%s %s)\n", - error_count, - msg, label, location, detail); + "%d UE %s%son %s (%s %s)\n", + error_count, msg, msg_aux, label, + location, detail); } if (edac_mc_get_panic_on_ue()) { if (other_detail && *other_detail) - panic("UE %s on %s (%s%s - %s)\n", - msg, label, location, detail, other_detail); + panic("UE %s%son %s (%s%s - %s)\n", + msg, msg_aux, label, location, detail, other_detail); else - panic("UE %s on %s (%s%s)\n", - msg, label, location, detail); + panic("UE %s%son %s (%s%s)\n", + msg, msg_aux, label, location, detail); } edac_inc_ue_error(mci, enable_per_layer_report, pos, error_count); From 1f31677e0d5492ce8776a39c9dcda4a0d75c7da1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 10 Aug 2012 12:50:50 +0200 Subject: [PATCH 05/24] amd64_edac: Small fixlets and cleanups amd64_get_dram_hole_info: remove local variable 'base'. sys_addr_to_dram_addr: do not clear local variable 'ret'. Also, sanitize constants formatting. Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index cc8e7c78a23c..5960a8a07fd2 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -423,7 +423,6 @@ int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base, u64 *hole_offset, u64 *hole_size) { struct amd64_pvt *pvt = mci->pvt_info; - u64 base; /* only revE and later have the DRAM Hole Address Register */ if (boot_cpu_data.x86 == 0xf && pvt->ext_model < K8_REV_E) { @@ -462,10 +461,8 @@ int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base, * addresses in the hole so that they start at 0x100000000. */ - base = dhar_base(pvt); - - *hole_base = base; - *hole_size = (0x1ull << 32) - base; + *hole_base = dhar_base(pvt); + *hole_size = (1ULL << 32) - *hole_base; if (boot_cpu_data.x86 > 0xf) *hole_offset = f10_dhar_offset(pvt); @@ -513,15 +510,15 @@ static u64 sys_addr_to_dram_addr(struct mem_ctl_info *mci, u64 sys_addr) { struct amd64_pvt *pvt = mci->pvt_info; u64 dram_base, hole_base, hole_offset, hole_size, dram_addr; - int ret = 0; + int ret; dram_base = get_dram_base(pvt, pvt->mc_node_id); ret = amd64_get_dram_hole_info(mci, &hole_base, &hole_offset, &hole_size); if (!ret) { - if ((sys_addr >= (1ull << 32)) && - (sys_addr < ((1ull << 32) + hole_size))) { + if ((sys_addr >= (1ULL << 32)) && + (sys_addr < ((1ULL << 32) + hole_size))) { /* use DHAR to translate SysAddr to DramAddr */ dram_addr = sys_addr - hole_offset; From 6e71a870b8ff2c1e2d89e5ea27a38cea39cefa3d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 9 Aug 2012 18:23:53 +0200 Subject: [PATCH 06/24] amd64_edac: Cleanup error injection code Invert kstrtoul return value testing and win one indentation level. Also, shorten up macro names so that the lines can fit into 80 cols. No functional change. Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.h | 17 +++--- drivers/edac/amd64_edac_inj.c | 112 ++++++++++++++++------------------ 2 files changed, 60 insertions(+), 69 deletions(-) diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 8c4139647efc..19a12a4fbf45 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -267,18 +267,19 @@ #define online_spare_bad_dramcs(pvt, c) (((pvt)->online_spare >> (4 + 4 * (c))) & 0x7) #define F10_NB_ARRAY_ADDR 0xB8 -#define F10_NB_ARRAY_DRAM_ECC BIT(31) +#define F10_NB_ARRAY_DRAM BIT(31) /* Bits [2:1] are used to select 16-byte section within a 64-byte cacheline */ -#define SET_NB_ARRAY_ADDRESS(section) (((section) & 0x3) << 1) +#define SET_NB_ARRAY_ADDR(section) (((section) & 0x3) << 1) #define F10_NB_ARRAY_DATA 0xBC -#define SET_NB_DRAM_INJECTION_WRITE(word, bits) \ - (BIT(((word) & 0xF) + 20) | \ - BIT(17) | bits) -#define SET_NB_DRAM_INJECTION_READ(word, bits) \ - (BIT(((word) & 0xF) + 20) | \ - BIT(16) | bits) +#define SET_NB_DRAM_INJECTION_WRITE(inj) \ + (BIT(((inj.word) & 0xF) + 20) | \ + BIT(17) | inj.bit_map) +#define SET_NB_DRAM_INJECTION_READ(inj) \ + (BIT(((inj.word) & 0xF) + 20) | \ + BIT(16) | inj.bit_map) + #define NBCAP 0xE8 #define NBCAP_CHIPKILL BIT(4) diff --git a/drivers/edac/amd64_edac_inj.c b/drivers/edac/amd64_edac_inj.c index 53d972e00dfb..8977e2fa61da 100644 --- a/drivers/edac/amd64_edac_inj.c +++ b/drivers/edac/amd64_edac_inj.c @@ -22,20 +22,19 @@ static ssize_t amd64_inject_section_store(struct device *dev, struct mem_ctl_info *mci = to_mci(dev); struct amd64_pvt *pvt = mci->pvt_info; unsigned long value; - int ret = 0; + int ret; ret = strict_strtoul(data, 10, &value); - if (ret != -EINVAL) { + if (ret < 0) + return ret; - if (value > 3) { - amd64_warn("%s: invalid section 0x%lx\n", __func__, value); - return -EINVAL; - } - - pvt->injection.section = (u32) value; - return count; + if (value > 3) { + amd64_warn("%s: invalid section 0x%lx\n", __func__, value); + return -EINVAL; } - return ret; + + pvt->injection.section = (u32) value; + return count; } static ssize_t amd64_inject_word_show(struct device *dev, @@ -60,20 +59,19 @@ static ssize_t amd64_inject_word_store(struct device *dev, struct mem_ctl_info *mci = to_mci(dev); struct amd64_pvt *pvt = mci->pvt_info; unsigned long value; - int ret = 0; + int ret; ret = strict_strtoul(data, 10, &value); - if (ret != -EINVAL) { + if (ret < 0) + return ret; - if (value > 8) { - amd64_warn("%s: invalid word 0x%lx\n", __func__, value); - return -EINVAL; - } - - pvt->injection.word = (u32) value; - return count; + if (value > 8) { + amd64_warn("%s: invalid word 0x%lx\n", __func__, value); + return -EINVAL; } - return ret; + + pvt->injection.word = (u32) value; + return count; } static ssize_t amd64_inject_ecc_vector_show(struct device *dev, @@ -97,21 +95,19 @@ static ssize_t amd64_inject_ecc_vector_store(struct device *dev, struct mem_ctl_info *mci = to_mci(dev); struct amd64_pvt *pvt = mci->pvt_info; unsigned long value; - int ret = 0; + int ret; ret = strict_strtoul(data, 16, &value); - if (ret != -EINVAL) { + if (ret < 0) + return ret; - if (value & 0xFFFF0000) { - amd64_warn("%s: invalid EccVector: 0x%lx\n", - __func__, value); - return -EINVAL; - } - - pvt->injection.bit_map = (u32) value; - return count; + if (value & 0xFFFF0000) { + amd64_warn("%s: invalid EccVector: 0x%lx\n", __func__, value); + return -EINVAL; } - return ret; + + pvt->injection.bit_map = (u32) value; + return count; } /* @@ -126,28 +122,25 @@ static ssize_t amd64_inject_read_store(struct device *dev, struct amd64_pvt *pvt = mci->pvt_info; unsigned long value; u32 section, word_bits; - int ret = 0; + int ret; ret = strict_strtoul(data, 10, &value); - if (ret != -EINVAL) { + if (ret < 0) + return ret; - /* Form value to choose 16-byte section of cacheline */ - section = F10_NB_ARRAY_DRAM_ECC | - SET_NB_ARRAY_ADDRESS(pvt->injection.section); - amd64_write_pci_cfg(pvt->F3, F10_NB_ARRAY_ADDR, section); + /* Form value to choose 16-byte section of cacheline */ + section = F10_NB_ARRAY_DRAM | SET_NB_ARRAY_ADDR(pvt->injection.section); - word_bits = SET_NB_DRAM_INJECTION_READ(pvt->injection.word, - pvt->injection.bit_map); + amd64_write_pci_cfg(pvt->F3, F10_NB_ARRAY_ADDR, section); - /* Issue 'word' and 'bit' along with the READ request */ - amd64_write_pci_cfg(pvt->F3, F10_NB_ARRAY_DATA, word_bits); + word_bits = SET_NB_DRAM_INJECTION_READ(pvt->injection); - edac_dbg(0, "section=0x%x word_bits=0x%x\n", - section, word_bits); + /* Issue 'word' and 'bit' along with the READ request */ + amd64_write_pci_cfg(pvt->F3, F10_NB_ARRAY_DATA, word_bits); - return count; - } - return ret; + edac_dbg(0, "section=0x%x word_bits=0x%x\n", section, word_bits); + + return count; } /* @@ -162,28 +155,25 @@ static ssize_t amd64_inject_write_store(struct device *dev, struct amd64_pvt *pvt = mci->pvt_info; unsigned long value; u32 section, word_bits; - int ret = 0; + int ret; ret = strict_strtoul(data, 10, &value); - if (ret != -EINVAL) { + if (ret < 0) + return ret; - /* Form value to choose 16-byte section of cacheline */ - section = F10_NB_ARRAY_DRAM_ECC | - SET_NB_ARRAY_ADDRESS(pvt->injection.section); - amd64_write_pci_cfg(pvt->F3, F10_NB_ARRAY_ADDR, section); + /* Form value to choose 16-byte section of cacheline */ + section = F10_NB_ARRAY_DRAM | SET_NB_ARRAY_ADDR(pvt->injection.section); - word_bits = SET_NB_DRAM_INJECTION_WRITE(pvt->injection.word, - pvt->injection.bit_map); + amd64_write_pci_cfg(pvt->F3, F10_NB_ARRAY_ADDR, section); - /* Issue 'word' and 'bit' along with the READ request */ - amd64_write_pci_cfg(pvt->F3, F10_NB_ARRAY_DATA, word_bits); + word_bits = SET_NB_DRAM_INJECTION_WRITE(pvt->injection); - edac_dbg(0, "section=0x%x word_bits=0x%x\n", - section, word_bits); + /* Issue 'word' and 'bit' along with the READ request */ + amd64_write_pci_cfg(pvt->F3, F10_NB_ARRAY_DATA, word_bits); - return count; - } - return ret; + edac_dbg(0, "section=0x%x word_bits=0x%x\n", section, word_bits); + + return count; } /* From 66fed2d464157eb20c37738d75b281458dfc2cab Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 9 Aug 2012 18:41:07 +0200 Subject: [PATCH 07/24] amd64_edac: Improve error injection When injecting DRAM ECC errors over the F3xB[8,C] interface, the machine does this by injecting the error in the next non-cached access. This takes relatively long time on a normal system so that in order for us to expedite it, we disable the caches around the injection. Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 10 +++++----- drivers/edac/amd64_edac.h | 23 +++++++++++++++++++---- drivers/edac/amd64_edac_inj.c | 18 +++++++++++++++++- 3 files changed, 41 insertions(+), 10 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 5960a8a07fd2..351496af9e8d 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -60,8 +60,8 @@ struct scrubrate { { 0x00, 0UL}, /* scrubbing off */ }; -static int __amd64_read_pci_cfg_dword(struct pci_dev *pdev, int offset, - u32 *val, const char *func) +int __amd64_read_pci_cfg_dword(struct pci_dev *pdev, int offset, + u32 *val, const char *func) { int err = 0; @@ -1980,11 +1980,11 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, struct mce *m) static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci, struct mce *m) { - u16 ec = EC(m->status); - u8 xec = XEC(m->status, 0x1f); u8 ecc_type = (m->status >> 45) & 0x3; + u8 xec = XEC(m->status, 0x1f); + u16 ec = EC(m->status); - /* Bail early out if this was an 'observed' error */ + /* Bail out early if this was an 'observed' error */ if (PP(ec) == NBSL_PP_OBS) return; diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 19a12a4fbf45..cf7981e1f063 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -273,9 +273,10 @@ #define SET_NB_ARRAY_ADDR(section) (((section) & 0x3) << 1) #define F10_NB_ARRAY_DATA 0xBC +#define F10_NB_ARR_ECC_WR_REQ BIT(17) #define SET_NB_DRAM_INJECTION_WRITE(inj) \ (BIT(((inj.word) & 0xF) + 20) | \ - BIT(17) | inj.bit_map) + F10_NB_ARR_ECC_WR_REQ | inj.bit_map) #define SET_NB_DRAM_INJECTION_READ(inj) \ (BIT(((inj.word) & 0xF) + 20) | \ BIT(16) | inj.bit_map) @@ -306,9 +307,9 @@ enum amd_families { /* Error injection control structure */ struct error_injection { - u32 section; - u32 word; - u32 bit_map; + u32 section; + u32 word; + u32 bit_map; }; /* low and high part of PCI config space regs */ @@ -460,6 +461,8 @@ struct amd64_family_type { struct low_ops ops; }; +int __amd64_read_pci_cfg_dword(struct pci_dev *pdev, int offset, + u32 *val, const char *func); int __amd64_write_pci_cfg_dword(struct pci_dev *pdev, int offset, u32 val, const char *func); @@ -476,3 +479,15 @@ int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base, u64 *hole_offset, u64 *hole_size); #define to_mci(k) container_of(k, struct mem_ctl_info, dev) + +/* Injection helpers */ +static inline void disable_caches(void *dummy) +{ + write_cr0(read_cr0() | X86_CR0_CD); + wbinvd(); +} + +static inline void enable_caches(void *dummy) +{ + write_cr0(read_cr0() & ~X86_CR0_CD); +} diff --git a/drivers/edac/amd64_edac_inj.c b/drivers/edac/amd64_edac_inj.c index 8977e2fa61da..8c171fa1cb9b 100644 --- a/drivers/edac/amd64_edac_inj.c +++ b/drivers/edac/amd64_edac_inj.c @@ -153,8 +153,8 @@ static ssize_t amd64_inject_write_store(struct device *dev, { struct mem_ctl_info *mci = to_mci(dev); struct amd64_pvt *pvt = mci->pvt_info; + u32 section, word_bits, tmp; unsigned long value; - u32 section, word_bits; int ret; ret = strict_strtoul(data, 10, &value); @@ -168,9 +168,25 @@ static ssize_t amd64_inject_write_store(struct device *dev, word_bits = SET_NB_DRAM_INJECTION_WRITE(pvt->injection); + pr_notice_once("Don't forget to decrease MCE polling interval in\n" + "/sys/bus/machinecheck/devices/machinecheck/check_interval\n" + "so that you can get the error report faster.\n"); + + on_each_cpu(disable_caches, NULL, 1); + /* Issue 'word' and 'bit' along with the READ request */ amd64_write_pci_cfg(pvt->F3, F10_NB_ARRAY_DATA, word_bits); + retry: + /* wait until injection happens */ + amd64_read_pci_cfg(pvt->F3, F10_NB_ARRAY_DATA, &tmp); + if (tmp & F10_NB_ARR_ECC_WR_REQ) { + cpu_relax(); + goto retry; + } + + on_each_cpu(enable_caches, NULL, 1); + edac_dbg(0, "section=0x%x word_bits=0x%x\n", section, word_bits); return count; From c8d1adf092d8aa1ed947da789a99eee1130aa304 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 30 Aug 2012 14:56:57 +0200 Subject: [PATCH 08/24] amd64_edac: Do not check whether error address is valid All families report a valid error address when encountering a DRAM ECC error so no need to check it. Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 351496af9e8d..987d6acd8f4e 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -1900,17 +1900,6 @@ static void amd64_handle_ce(struct mem_ctl_info *mci, struct mce *m) u64 sys_addr; u16 syndrome; - /* Ensure that the Error Address is VALID */ - if (!(m->status & MCI_STATUS_ADDRV)) { - amd64_mc_err(mci, "HW has no ERROR_ADDRESS available\n"); - edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1, - 0, 0, 0, - -1, -1, -1, - "HW has no ERROR_ADDRESS available", - ""); - return; - } - sys_addr = get_error_address(m); syndrome = extract_syndrome(m->status); @@ -1929,16 +1918,6 @@ static void amd64_handle_ue(struct mem_ctl_info *mci, struct mce *m) log_mci = mci; - if (!(m->status & MCI_STATUS_ADDRV)) { - amd64_mc_err(mci, "HW has no ERROR_ADDRESS available\n"); - edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1, - 0, 0, 0, - -1, -1, -1, - "HW has no ERROR_ADDRESS available", - ""); - return; - } - sys_addr = get_error_address(m); error_address_to_page_and_offset(sys_addr, &page, &offset); From 33ca0643c9a0ea50d0dc9bf0e9e9044502c7038c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 30 Aug 2012 18:01:36 +0200 Subject: [PATCH 09/24] amd64_edac: Reorganize error reporting path Rewrite CE/UE paths so that they use the same code and drop additional code duplication in handle_ue. Add a struct err_info which collects required info for the error reporting. This, in turn, helps slimming all edac_mc_handle_error() calls down to one. Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 194 ++++++++++++++------------------------ drivers/edac/amd64_edac.h | 19 +++- 2 files changed, 89 insertions(+), 124 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 987d6acd8f4e..d21efb246d43 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -709,10 +709,10 @@ static inline u64 input_addr_to_sys_addr(struct mem_ctl_info *mci, /* Map the Error address to a PAGE and PAGE OFFSET. */ static inline void error_address_to_page_and_offset(u64 error_address, - u32 *page, u32 *offset) + struct err_info *err) { - *page = (u32) (error_address >> PAGE_SHIFT); - *offset = ((u32) error_address) & ~PAGE_MASK; + err->page = (u32) (error_address >> PAGE_SHIFT); + err->offset = ((u32) error_address) & ~PAGE_MASK; } /* @@ -1023,59 +1023,44 @@ static void read_dram_base_limit_regs(struct amd64_pvt *pvt, unsigned range) } static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr, - u16 syndrome) + struct err_info *err) { - struct mem_ctl_info *src_mci; struct amd64_pvt *pvt = mci->pvt_info; - int channel, csrow; - u32 page, offset; - error_address_to_page_and_offset(sys_addr, &page, &offset); + error_address_to_page_and_offset(sys_addr, err); /* * Find out which node the error address belongs to. This may be * different from the node that detected the error. */ - src_mci = find_mc_by_sys_addr(mci, sys_addr); - if (!src_mci) { + err->src_mci = find_mc_by_sys_addr(mci, sys_addr); + if (!err->src_mci) { amd64_mc_err(mci, "failed to map error addr 0x%lx to a node\n", (unsigned long)sys_addr); - edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1, - page, offset, syndrome, - -1, -1, -1, - "failed to map error addr to a node", - ""); + err->err_code = ERR_NODE; return; } /* Now map the sys_addr to a CSROW */ - csrow = sys_addr_to_csrow(src_mci, sys_addr); - if (csrow < 0) { - edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1, - page, offset, syndrome, - -1, -1, -1, - "failed to map error addr to a csrow", - ""); + err->csrow = sys_addr_to_csrow(err->src_mci, sys_addr); + if (err->csrow < 0) { + err->err_code = ERR_CSROW; return; } /* CHIPKILL enabled */ if (pvt->nbcfg & NBCFG_CHIPKILL) { - channel = get_channel_from_ecc_syndrome(mci, syndrome); - if (channel < 0) { + err->channel = get_channel_from_ecc_syndrome(mci, err->syndrome); + if (err->channel < 0) { /* * Syndrome didn't map, so we don't know which of the * 2 DIMMs is in error. So we need to ID 'both' of them * as suspect. */ - amd64_mc_warn(src_mci, "unknown syndrome 0x%04x - " + amd64_mc_warn(err->src_mci, "unknown syndrome 0x%04x - " "possible error reporting race\n", - syndrome); - edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1, - page, offset, syndrome, - csrow, -1, -1, - "unknown syndrome - possible error reporting race", - ""); + err->syndrome); + err->err_code = ERR_CHANNEL; return; } } else { @@ -1087,13 +1072,8 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr, * was obtained from email communication with someone at AMD. * (Wish the email was placed in this comment - norsk) */ - channel = ((sys_addr & BIT(3)) != 0); + err->channel = ((sys_addr & BIT(3)) != 0); } - - edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, src_mci, 1, - page, offset, syndrome, - csrow, channel, -1, - "", ""); } static int ddr2_cs_size(unsigned i, bool dct_width) @@ -1479,7 +1459,7 @@ static u64 f1x_swap_interleaved_region(struct amd64_pvt *pvt, u64 sys_addr) /* For a given @dram_range, check if @sys_addr falls within it. */ static int f1x_match_to_this_node(struct amd64_pvt *pvt, unsigned range, - u64 sys_addr, int *nid, int *chan_sel) + u64 sys_addr, int *chan_sel) { int cs_found = -EINVAL; u64 chan_addr; @@ -1552,15 +1532,14 @@ static int f1x_match_to_this_node(struct amd64_pvt *pvt, unsigned range, cs_found = f1x_lookup_addr_in_dct(chan_addr, node_id, channel); - if (cs_found >= 0) { - *nid = node_id; + if (cs_found >= 0) *chan_sel = channel; - } + return cs_found; } static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr, - int *node, int *chan_sel) + int *chan_sel) { int cs_found = -EINVAL; unsigned range; @@ -1574,8 +1553,7 @@ static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr, (get_dram_limit(pvt, range) >= sys_addr)) { cs_found = f1x_match_to_this_node(pvt, range, - sys_addr, node, - chan_sel); + sys_addr, chan_sel); if (cs_found >= 0) break; } @@ -1591,22 +1569,15 @@ static int f1x_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr, * (MCX_ADDR). */ static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr, - u16 syndrome) + struct err_info *err) { struct amd64_pvt *pvt = mci->pvt_info; - u32 page, offset; - int nid, csrow, chan = 0; - error_address_to_page_and_offset(sys_addr, &page, &offset); + error_address_to_page_and_offset(sys_addr, err); - csrow = f1x_translate_sysaddr_to_cs(pvt, sys_addr, &nid, &chan); - - if (csrow < 0) { - edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1, - page, offset, syndrome, - -1, -1, -1, - "failed to map error addr to a csrow", - ""); + err->csrow = f1x_translate_sysaddr_to_cs(pvt, sys_addr, &err->channel); + if (err->csrow < 0) { + err->err_code = ERR_CSROW; return; } @@ -1616,12 +1587,7 @@ static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr, * this point. */ if (dct_ganging_enabled(pvt)) - chan = get_channel_from_ecc_syndrome(mci, syndrome); - - edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1, - page, offset, syndrome, - csrow, chan, -1, - "", ""); + err->channel = get_channel_from_ecc_syndrome(mci, err->syndrome); } /* @@ -1890,78 +1856,54 @@ static int get_channel_from_ecc_syndrome(struct mem_ctl_info *mci, u16 syndrome) return map_err_sym_to_channel(err_sym, pvt->ecc_sym_sz); } -/* - * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR - * ADDRESS and process. - */ -static void amd64_handle_ce(struct mem_ctl_info *mci, struct mce *m) +static void __log_bus_error(struct mem_ctl_info *mci, struct err_info *err, + u8 ecc_type) { - struct amd64_pvt *pvt = mci->pvt_info; - u64 sys_addr; - u16 syndrome; + enum hw_event_mc_err_type err_type; + const char *string; - sys_addr = get_error_address(m); - syndrome = extract_syndrome(m->status); - - amd64_mc_err(mci, "CE ERROR_ADDRESS= 0x%llx\n", sys_addr); - - pvt->ops->map_sysaddr_to_csrow(mci, sys_addr, syndrome); -} - -/* Handle any Un-correctable Errors (UEs) */ -static void amd64_handle_ue(struct mem_ctl_info *mci, struct mce *m) -{ - struct mem_ctl_info *log_mci, *src_mci = NULL; - int csrow; - u64 sys_addr; - u32 page, offset; - - log_mci = mci; - - sys_addr = get_error_address(m); - error_address_to_page_and_offset(sys_addr, &page, &offset); - - /* - * Find out which node the error address belongs to. This may be - * different from the node that detected the error. - */ - src_mci = find_mc_by_sys_addr(mci, sys_addr); - if (!src_mci) { - amd64_mc_err(mci, "ERROR ADDRESS (0x%lx) NOT mapped to a MC\n", - (unsigned long)sys_addr); - edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1, - page, offset, 0, - -1, -1, -1, - "ERROR ADDRESS NOT mapped to a MC", - ""); + if (ecc_type == 2) + err_type = HW_EVENT_ERR_CORRECTED; + else if (ecc_type == 1) + err_type = HW_EVENT_ERR_UNCORRECTED; + else { + WARN(1, "Something is rotten in the state of Denmark.\n"); return; } - log_mci = src_mci; - - csrow = sys_addr_to_csrow(log_mci, sys_addr); - if (csrow < 0) { - amd64_mc_err(mci, "ERROR_ADDRESS (0x%lx) NOT mapped to CS\n", - (unsigned long)sys_addr); - edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1, - page, offset, 0, - -1, -1, -1, - "ERROR ADDRESS NOT mapped to CS", - ""); - } else { - edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1, - page, offset, 0, - csrow, -1, -1, - "", ""); + switch (err->err_code) { + case DECODE_OK: + string = ""; + break; + case ERR_NODE: + string = "Failed to map error addr to a node"; + break; + case ERR_CSROW: + string = "Failed to map error addr to a csrow"; + break; + case ERR_CHANNEL: + string = "unknown syndrome - possible error reporting race"; + break; + default: + string = "WTF error"; + break; } + + edac_mc_handle_error(err_type, mci, 1, + err->page, err->offset, err->syndrome, + err->csrow, err->channel, -1, + string, ""); } static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci, struct mce *m) { + struct amd64_pvt *pvt = mci->pvt_info; u8 ecc_type = (m->status >> 45) & 0x3; u8 xec = XEC(m->status, 0x1f); u16 ec = EC(m->status); + u64 sys_addr; + struct err_info err; /* Bail out early if this was an 'observed' error */ if (PP(ec) == NBSL_PP_OBS) @@ -1971,10 +1913,16 @@ static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci, if (xec && xec != F10_NBSL_EXT_ERR_ECC) return; + memset(&err, 0, sizeof(err)); + + sys_addr = get_error_address(m); + if (ecc_type == 2) - amd64_handle_ce(mci, m); - else if (ecc_type == 1) - amd64_handle_ue(mci, m); + err.syndrome = extract_syndrome(m->status); + + pvt->ops->map_sysaddr_to_csrow(mci, sys_addr, &err); + + __log_bus_error(mci, &err, ecc_type); } void amd64_decode_bus_error(int node_id, struct mce *m) diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index cf7981e1f063..abefab4722c2 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -376,6 +376,23 @@ struct amd64_pvt { struct error_injection injection; }; +enum err_codes { + DECODE_OK = 0, + ERR_NODE = -1, + ERR_CSROW = -2, + ERR_CHANNEL = -3, +}; + +struct err_info { + int err_code; + struct mem_ctl_info *src_mci; + int csrow; + int channel; + u16 syndrome; + u32 page; + u32 offset; +}; + static inline u64 get_dram_base(struct amd64_pvt *pvt, unsigned i) { u64 addr = ((u64)pvt->ranges[i].base.lo & 0xffff0000) << 8; @@ -449,7 +466,7 @@ static inline void amd64_remove_sysfs_inject_files(struct mem_ctl_info *mci) struct low_ops { int (*early_channel_count) (struct amd64_pvt *pvt); void (*map_sysaddr_to_csrow) (struct mem_ctl_info *mci, u64 sys_addr, - u16 syndrome); + struct err_info *); int (*dbam_to_cs) (struct amd64_pvt *pvt, u8 dct, unsigned cs_mode); int (*read_dct_pci_cfg) (struct amd64_pvt *pvt, int offset, u32 *val, const char *func); From bb89f5a0547a56cf2406f1c3d6cd44f8fa62256d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 12 Sep 2012 18:06:00 +0200 Subject: [PATCH 10/24] amd64_edac: Fix K8 chip select reporting This basically reverts 603adaf6b3e3 ("amd64_edac: fix K8 chip select reporting") because it was a clumsy workaround for DIMM sizes reporting on K8 which got superceded by a much more correct one with 41d8bfaba70 ("amd64_edac: Improve DRAM address mapping") without removing the prior one. Remove it now finally. Reported-by: Josh Hunt Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index d21efb246d43..cfb7e57786d1 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -1596,14 +1596,11 @@ static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr, */ static void amd64_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl) { - int dimm, size0, size1, factor = 0; + int dimm, size0, size1; u32 *dcsb = ctrl ? pvt->csels[1].csbases : pvt->csels[0].csbases; u32 dbam = ctrl ? pvt->dbam1 : pvt->dbam0; if (boot_cpu_data.x86 == 0xf) { - if (pvt->dclr0 & WIDTH_128) - factor = 1; - /* K8 families < revF not supported yet */ if (pvt->ext_model < K8_REV_F) return; @@ -1634,8 +1631,8 @@ static void amd64_debug_display_dimm_sizes(struct amd64_pvt *pvt, u8 ctrl) DBAM_DIMM(dimm, dbam)); amd64_info(EDAC_MC ": %d: %5dMB %d: %5dMB\n", - dimm * 2, size0 << factor, - dimm * 2 + 1, size1 << factor); + dimm * 2, size0, + dimm * 2 + 1, size1); } } From 0a5dfc31405d9b07a5b37f150815b9ad09685460 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 12 Sep 2012 18:16:01 +0200 Subject: [PATCH 11/24] amd64_edac: Use DBAM_DIMM macro Instead of open-coding it, use the DBAM_DIMM macro in amd64_csrow_nr_pages() which we have already. Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 2 +- drivers/edac/amd64_edac.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index cfb7e57786d1..89cd71ea0e50 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2094,7 +2094,7 @@ static u32 amd64_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr) * number of bits to shift the DBAM register to extract the proper CSROW * field. */ - cs_mode = (dbam >> ((csrow_nr / 2) * 4)) & 0xF; + cs_mode = DBAM_DIMM(csrow_nr / 2, dbam); nr_pages = pvt->ops->dbam_to_cs(pvt, dct, cs_mode) << (20 - PAGE_SHIFT); diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index abefab4722c2..e864f407806c 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -219,7 +219,7 @@ #define DBAM1 0x180 /* Extract the DIMM 'type' on the i'th DIMM from the DBAM reg value passed */ -#define DBAM_DIMM(i, reg) ((((reg) >> (4*i))) & 0xF) +#define DBAM_DIMM(i, reg) ((((reg) >> (4*(i)))) & 0xF) #define DBAM_MAX_VALUE 11 From 10de6497a56e933d9ddca94aff186ac3b3105af9 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 12 Sep 2012 19:00:38 +0200 Subject: [PATCH 12/24] amd64_edac: Fix csrows size and pages computation Make sure code pays attention to K8 having only one DCT, reformat and cleanup code, correct debug messages, remove unused code. Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 50 ++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 89cd71ea0e50..cb64bec374d9 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2087,6 +2087,7 @@ static u32 amd64_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr) u32 cs_mode, nr_pages; u32 dbam = dct ? pvt->dbam1 : pvt->dbam0; + /* * The math on this doesn't look right on the surface because x/2*4 can * be simplified to x*2 but this expression makes use of the fact that @@ -2098,9 +2099,9 @@ static u32 amd64_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr) nr_pages = pvt->ops->dbam_to_cs(pvt, dct, cs_mode) << (20 - PAGE_SHIFT); - edac_dbg(0, " (csrow=%d) DBAM map index= %d\n", csrow_nr, cs_mode); - edac_dbg(0, " nr_pages/channel= %u channel-count = %d\n", - nr_pages, pvt->channel_count); + edac_dbg(0, "csrow: %d, channel: %d, DBAM idx: %d\n", + csrow_nr, dct, cs_mode); + edac_dbg(0, "nr_pages/channel: %u\n", nr_pages); return nr_pages; } @@ -2111,15 +2112,14 @@ static u32 amd64_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr) */ static int init_csrows(struct mem_ctl_info *mci) { + struct amd64_pvt *pvt = mci->pvt_info; struct csrow_info *csrow; struct dimm_info *dimm; - struct amd64_pvt *pvt = mci->pvt_info; - u64 base, mask; - u32 val; - int i, j, empty = 1; - enum mem_type mtype; enum edac_type edac_mode; + enum mem_type mtype; + int i, j, empty = 1; int nr_pages = 0; + u32 val; amd64_read_pci_cfg(pvt->F3, NBCFG, &val); @@ -2129,29 +2129,35 @@ static int init_csrows(struct mem_ctl_info *mci) pvt->mc_node_id, val, !!(val & NBCFG_CHIPKILL), !!(val & NBCFG_ECC_ENABLE)); + /* + * We iterate over DCT0 here but we look at DCT1 in parallel, if needed. + */ for_each_chip_select(i, 0, pvt) { - csrow = mci->csrows[i]; + bool row_dct0 = !!csrow_enabled(i, 0, pvt); + bool row_dct1 = false; - if (!csrow_enabled(i, 0, pvt) && !csrow_enabled(i, 1, pvt)) { - edac_dbg(1, "----CSROW %d VALID for MC node %d\n", - i, pvt->mc_node_id); + if (boot_cpu_data.x86 != 0xf) + row_dct1 = !!csrow_enabled(i, 1, pvt); + + if (!row_dct0 && !row_dct1) continue; - } + csrow = mci->csrows[i]; empty = 0; - if (csrow_enabled(i, 0, pvt)) - nr_pages = amd64_csrow_nr_pages(pvt, 0, i); - if (csrow_enabled(i, 1, pvt)) - nr_pages += amd64_csrow_nr_pages(pvt, 1, i); - get_cs_base_and_mask(pvt, i, 0, &base, &mask); - /* 8 bytes of resolution */ + edac_dbg(1, "MC node: %d, csrow: %d\n", + pvt->mc_node_id, i); + + if (row_dct0) + nr_pages = amd64_csrow_nr_pages(pvt, 0, i); + + /* K8 has only one DCT */ + if (boot_cpu_data.x86 != 0xf && row_dct1) + nr_pages += amd64_csrow_nr_pages(pvt, 1, i); mtype = amd64_determine_memory_type(pvt, i); - edac_dbg(1, " for MC node %d csrow %d:\n", pvt->mc_node_id, i); - edac_dbg(1, " nr_pages: %u\n", - nr_pages * pvt->channel_count); + edac_dbg(1, "Total csrow%d pages: %u\n", i, nr_pages); /* * determine whether CHIPKILL or JUST ECC or NO ECC is operating From 11652769179296062c74233e168399a87a3f6e8a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 13 Sep 2012 17:19:40 +0200 Subject: [PATCH 13/24] EDAC: Add memory controller flags The first flag is ->csbased and will be used in common EDAC code later. Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 1 + include/linux/edac.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index cb64bec374d9..307ff66266a0 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2518,6 +2518,7 @@ static int amd64_init_one_instance(struct pci_dev *F2) mci->pvt_info = pvt; mci->pdev = &pvt->F2->dev; + mci->csbased = 1; setup_mci_misc_attrs(mci, fam_type); diff --git a/include/linux/edac.h b/include/linux/edac.h index bab9f8473dc1..07bda01bf20a 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -667,6 +667,8 @@ struct mem_ctl_info { u32 fake_inject_ue; u16 fake_inject_count; #endif + __u8 csbased : 1, /* csrow-based memory controller */ + __resv : 7; }; #endif From 921a689965c26ae6fde12ebaadb4183cbfdb01fb Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 13 Sep 2012 18:46:39 +0200 Subject: [PATCH 14/24] EDAC: Pass mci parent Initialize the mem_ctl_info descriptor of a csrow properly. Signed-off-by: Borislav Petkov --- drivers/edac/edac_mc_sysfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index ed0bc07b8503..cf13bff94f5c 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -373,6 +373,7 @@ static int edac_create_csrow_object(struct mem_ctl_info *mci, csrow->dev.bus = &mci->bus; device_initialize(&csrow->dev); csrow->dev.parent = &mci->dev; + csrow->mci = mci; dev_set_name(&csrow->dev, "csrow%d", index); dev_set_drvdata(&csrow->dev, csrow); From 16a528ee3975c860dc93fbfc718fe9aa25ed92bc Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 13 Sep 2012 18:53:58 +0200 Subject: [PATCH 15/24] EDAC: Fix csrow size reported in sysfs On csrow-based memory controllers, we combine the csrow size from both channels and there's no need to do that again in csrow_size_show which leads to double the size of a csrow. Fix it. Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 1 + drivers/edac/edac_mc_sysfs.c | 3 +++ include/linux/edac.h | 1 + 3 files changed, 5 insertions(+) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 307ff66266a0..f74a684269ff 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2174,6 +2174,7 @@ static int init_csrows(struct mem_ctl_info *mci) dimm->edac_mode = edac_mode; dimm->nr_pages = nr_pages; } + csrow->nr_pages = nr_pages; } return empty; diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index cf13bff94f5c..bd46610979c7 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -180,6 +180,9 @@ static ssize_t csrow_size_show(struct device *dev, int i; u32 nr_pages = 0; + if (csrow->mci->csbased) + return sprintf(data, "%u\n", PAGES_TO_MiB(csrow->nr_pages)); + for (i = 0; i < csrow->nr_channels; i++) nr_pages += csrow->channels[i]->dimm->nr_pages; return sprintf(data, "%u\n", PAGES_TO_MiB(nr_pages)); diff --git a/include/linux/edac.h b/include/linux/edac.h index 07bda01bf20a..1b8c02b36f76 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -533,6 +533,7 @@ struct csrow_info { u32 ue_count; /* Uncorrectable Errors for this csrow */ u32 ce_count; /* Correctable Errors for this csrow */ + u32 nr_pages; /* combined pages count of all channels */ struct mem_ctl_info *mci; /* the parent */ From 3c0622760aaa4731e2fd3a7472a96b59d2caecc4 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Fri, 21 Sep 2012 07:45:49 -0700 Subject: [PATCH 16/24] EDAC: Fix mc size reported in sysfs This is the complement to previous commit "EDAC: Fix csrow size reported in sysfs". This fixes the memory controller size reporting on csrow-based memory controllers. The csrow size is already combined for both channels. Without this patch memory size is reported doubled. Signed-off-by: Josh Hunt Signed-off-by: Borislav Petkov --- drivers/edac/edac_mc_sysfs.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index bd46610979c7..a242dae1aeb0 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -781,10 +781,14 @@ static ssize_t mci_size_mb_show(struct device *dev, for (csrow_idx = 0; csrow_idx < mci->nr_csrows; csrow_idx++) { struct csrow_info *csrow = mci->csrows[csrow_idx]; - for (j = 0; j < csrow->nr_channels; j++) { - struct dimm_info *dimm = csrow->channels[j]->dimm; + if (csrow->mci->csbased) { + total_pages += csrow->nr_pages; + } else { + for (j = 0; j < csrow->nr_channels; j++) { + struct dimm_info *dimm = csrow->channels[j]->dimm; - total_pages += dimm->nr_pages; + total_pages += dimm->nr_pages; + } } } From f35d852e8038d6eccd0cce3cf1df1d4bd7d2c473 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 17 Oct 2012 16:30:27 +0800 Subject: [PATCH 17/24] EDAC, Calxeda highbank: Convert to use simple_open() This removes an open coded simple_open() function and replaces file operations references to the function with simple_open() instead. dpatch engine is used to auto generate this patch. (https://github.com/weiyj/dpatch) Cc: Rob Herring Signed-off-by: Wei Yongjun Signed-off-by: Borislav Petkov --- drivers/edac/highbank_mc_edac.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/edac/highbank_mc_edac.c b/drivers/edac/highbank_mc_edac.c index c769f477fd22..7ea4cc2e8bd2 100644 --- a/drivers/edac/highbank_mc_edac.c +++ b/drivers/edac/highbank_mc_edac.c @@ -113,14 +113,8 @@ static ssize_t highbank_mc_err_inject_write(struct file *file, return count; } -static int debugfs_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - static const struct file_operations highbank_mc_debug_inject_fops = { - .open = debugfs_open, + .open = simple_open, .write = highbank_mc_err_inject_write, .llseek = generic_file_llseek, }; From db7312a295ec113fa7b3f7486c4b62b936a357d3 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 17 Oct 2012 16:32:01 +0800 Subject: [PATCH 18/24] EDAC: Convert to use simple_open() This removes an open coded simple_open() function and replaces file operations references to the function with simple_open() instead. dpatch engine is used to auto generate this patch. (https://github.com/weiyj/dpatch) Signed-off-by: Wei Yongjun Signed-off-by: Borislav Petkov --- drivers/edac/edac_mc_sysfs.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index a242dae1aeb0..52813b8b0f5f 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -846,14 +846,8 @@ static ssize_t edac_fake_inject_write(struct file *file, return count; } -static int debugfs_open(struct inode *inode, struct file *file) -{ - file->private_data = inode->i_private; - return 0; -} - static const struct file_operations debug_fake_inject_fops = { - .open = debugfs_open, + .open = simple_open, .write = edac_fake_inject_write, .llseek = generic_file_llseek, }; From f05c41a9c6057a0d5851ebc9589e3834fde1a4b6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 11 Sep 2012 18:57:43 +0200 Subject: [PATCH 19/24] MCE, AMD: Remove functional unit references Having the functional unit names in each bank decode is only misleading as this code supports multiple families and there's no guarantee the mapping between FUs and MCE banks will stay the same. And also, knowing the functional unit name doesn't help much since you end up looking at the respective BKDG anyway. So drop all FU references and use the MC bank numbers instead. Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 186 ++++++++++++++++++++--------------------- drivers/edac/mce_amd.h | 5 +- 2 files changed, 94 insertions(+), 97 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index d0c372e30de4..6b38c1186922 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -64,7 +64,7 @@ EXPORT_SYMBOL_GPL(to_msgs); const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; EXPORT_SYMBOL_GPL(ii_msgs); -static const char * const f15h_ic_mce_desc[] = { +static const char * const f15h_mc1_mce_desc[] = { "UC during a demand linefill from L2", "Parity error during data load from IC", "Parity error for IC valid bit", @@ -84,7 +84,7 @@ static const char * const f15h_ic_mce_desc[] = { "fetch address FIFO" }; -static const char * const f15h_cu_mce_desc[] = { +static const char * const f15h_mc2_mce_desc[] = { "Fill ECC error on data fills", /* xec = 0x4 */ "Fill parity error on insn fills", "Prefetcher request FIFO parity error", @@ -101,7 +101,7 @@ static const char * const f15h_cu_mce_desc[] = { "PRB address parity error" }; -static const char * const nb_mce_desc[] = { +static const char * const mc4_mce_desc[] = { "DRAM ECC error detected on the NB", "CRC error detected on HT link", "Link-defined sync error packets detected on HT link", @@ -123,7 +123,7 @@ static const char * const nb_mce_desc[] = { "ECC Error in the Probe Filter directory" }; -static const char * const fr_ex_mce_desc[] = { +static const char * const mc5_mce_desc[] = { "CPU Watchdog timer expire", "Wakeup array dest tag", "AG payload array", @@ -139,7 +139,7 @@ static const char * const fr_ex_mce_desc[] = { "DE error occurred" }; -static bool f12h_dc_mce(u16 ec, u8 xec) +static bool f12h_mc0_mce(u16 ec, u8 xec) { bool ret = false; @@ -157,26 +157,26 @@ static bool f12h_dc_mce(u16 ec, u8 xec) return ret; } -static bool f10h_dc_mce(u16 ec, u8 xec) +static bool f10h_mc0_mce(u16 ec, u8 xec) { if (R4(ec) == R4_GEN && LL(ec) == LL_L1) { pr_cont("during data scrub.\n"); return true; } - return f12h_dc_mce(ec, xec); + return f12h_mc0_mce(ec, xec); } -static bool k8_dc_mce(u16 ec, u8 xec) +static bool k8_mc0_mce(u16 ec, u8 xec) { if (BUS_ERROR(ec)) { pr_cont("during system linefill.\n"); return true; } - return f10h_dc_mce(ec, xec); + return f10h_mc0_mce(ec, xec); } -static bool f14h_dc_mce(u16 ec, u8 xec) +static bool f14h_mc0_mce(u16 ec, u8 xec) { u8 r4 = R4(ec); bool ret = true; @@ -228,7 +228,7 @@ static bool f14h_dc_mce(u16 ec, u8 xec) return ret; } -static bool f15h_dc_mce(u16 ec, u8 xec) +static bool f15h_mc0_mce(u16 ec, u8 xec) { bool ret = true; @@ -275,12 +275,12 @@ static bool f15h_dc_mce(u16 ec, u8 xec) return ret; } -static void amd_decode_dc_mce(struct mce *m) +static void decode_mc0_mce(struct mce *m) { u16 ec = EC(m->status); u8 xec = XEC(m->status, xec_mask); - pr_emerg(HW_ERR "Data Cache Error: "); + pr_emerg(HW_ERR "MC0 Error: "); /* TLB error signatures are the same across families */ if (TLB_ERROR(ec)) { @@ -290,13 +290,13 @@ static void amd_decode_dc_mce(struct mce *m) : (xec ? "multimatch" : "parity"))); return; } - } else if (fam_ops->dc_mce(ec, xec)) + } else if (fam_ops->mc0_mce(ec, xec)) ; else - pr_emerg(HW_ERR "Corrupted DC MCE info?\n"); + pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n"); } -static bool k8_ic_mce(u16 ec, u8 xec) +static bool k8_mc1_mce(u16 ec, u8 xec) { u8 ll = LL(ec); bool ret = true; @@ -330,7 +330,7 @@ static bool k8_ic_mce(u16 ec, u8 xec) return ret; } -static bool f14h_ic_mce(u16 ec, u8 xec) +static bool f14h_mc1_mce(u16 ec, u8 xec) { u8 r4 = R4(ec); bool ret = true; @@ -349,7 +349,7 @@ static bool f14h_ic_mce(u16 ec, u8 xec) return ret; } -static bool f15h_ic_mce(u16 ec, u8 xec) +static bool f15h_mc1_mce(u16 ec, u8 xec) { bool ret = true; @@ -358,19 +358,19 @@ static bool f15h_ic_mce(u16 ec, u8 xec) switch (xec) { case 0x0 ... 0xa: - pr_cont("%s.\n", f15h_ic_mce_desc[xec]); + pr_cont("%s.\n", f15h_mc1_mce_desc[xec]); break; case 0xd: - pr_cont("%s.\n", f15h_ic_mce_desc[xec-2]); + pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]); break; case 0x10: - pr_cont("%s.\n", f15h_ic_mce_desc[xec-4]); + pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]); break; case 0x11 ... 0x14: - pr_cont("Decoder %s parity error.\n", f15h_ic_mce_desc[xec-4]); + pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]); break; default: @@ -379,12 +379,12 @@ static bool f15h_ic_mce(u16 ec, u8 xec) return ret; } -static void amd_decode_ic_mce(struct mce *m) +static void decode_mc1_mce(struct mce *m) { u16 ec = EC(m->status); u8 xec = XEC(m->status, xec_mask); - pr_emerg(HW_ERR "Instruction Cache Error: "); + pr_emerg(HW_ERR "MC1 Error: "); if (TLB_ERROR(ec)) pr_cont("%s TLB %s.\n", LL_MSG(ec), @@ -393,18 +393,18 @@ static void amd_decode_ic_mce(struct mce *m) bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58))); pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read")); - } else if (fam_ops->ic_mce(ec, xec)) + } else if (fam_ops->mc1_mce(ec, xec)) ; else - pr_emerg(HW_ERR "Corrupted IC MCE info?\n"); + pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n"); } -static void amd_decode_bu_mce(struct mce *m) +static void decode_mc2_mce(struct mce *m) { u16 ec = EC(m->status); u8 xec = XEC(m->status, xec_mask); - pr_emerg(HW_ERR "Bus Unit Error"); + pr_emerg(HW_ERR "MC2 Error"); if (xec == 0x1) pr_cont(" in the write data buffers.\n"); @@ -429,24 +429,24 @@ static void amd_decode_bu_mce(struct mce *m) pr_cont(": %s parity/ECC error during data " "access from L2.\n", R4_MSG(ec)); else - goto wrong_bu_mce; + goto wrong_mc2_mce; } else - goto wrong_bu_mce; + goto wrong_mc2_mce; } else - goto wrong_bu_mce; + goto wrong_mc2_mce; return; -wrong_bu_mce: - pr_emerg(HW_ERR "Corrupted BU MCE info?\n"); + wrong_mc2_mce: + pr_emerg(HW_ERR "Corrupted MC2 MCE info?\n"); } -static void amd_decode_cu_mce(struct mce *m) +static void decode_f15_mc2_mce(struct mce *m) { u16 ec = EC(m->status); u8 xec = XEC(m->status, xec_mask); - pr_emerg(HW_ERR "Combined Unit Error: "); + pr_emerg(HW_ERR "MC2 Error: "); if (TLB_ERROR(ec)) { if (xec == 0x0) @@ -454,63 +454,63 @@ static void amd_decode_cu_mce(struct mce *m) else if (xec == 0x1) pr_cont("Poison data provided for TLB fill.\n"); else - goto wrong_cu_mce; + goto wrong_f15_mc2_mce; } else if (BUS_ERROR(ec)) { if (xec > 2) - goto wrong_cu_mce; + goto wrong_f15_mc2_mce; pr_cont("Error during attempted NB data read.\n"); } else if (MEM_ERROR(ec)) { switch (xec) { case 0x4 ... 0xc: - pr_cont("%s.\n", f15h_cu_mce_desc[xec - 0x4]); + pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]); break; case 0x10 ... 0x14: - pr_cont("%s.\n", f15h_cu_mce_desc[xec - 0x7]); + pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]); break; default: - goto wrong_cu_mce; + goto wrong_f15_mc2_mce; } } return; -wrong_cu_mce: - pr_emerg(HW_ERR "Corrupted CU MCE info?\n"); + wrong_f15_mc2_mce: + pr_emerg(HW_ERR "Corrupted MC2 MCE info?\n"); } -static void amd_decode_ls_mce(struct mce *m) +static void decode_mc3_mce(struct mce *m) { u16 ec = EC(m->status); u8 xec = XEC(m->status, xec_mask); if (boot_cpu_data.x86 >= 0x14) { - pr_emerg("You shouldn't be seeing an LS MCE on this cpu family," + pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family," " please report on LKML.\n"); return; } - pr_emerg(HW_ERR "Load Store Error"); + pr_emerg(HW_ERR "MC3 Error"); if (xec == 0x0) { u8 r4 = R4(ec); if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR)) - goto wrong_ls_mce; + goto wrong_mc3_mce; pr_cont(" during %s.\n", R4_MSG(ec)); } else - goto wrong_ls_mce; + goto wrong_mc3_mce; return; -wrong_ls_mce: - pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); + wrong_mc3_mce: + pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n"); } -void amd_decode_nb_mce(struct mce *m) +static void decode_mc4_mce(struct mce *m) { struct cpuinfo_x86 *c = &boot_cpu_data; int node_id = amd_get_nb_id(m->extcpu); @@ -518,7 +518,7 @@ void amd_decode_nb_mce(struct mce *m) u8 xec = XEC(m->status, 0x1f); u8 offset = 0; - pr_emerg(HW_ERR "Northbridge Error (node %d): ", node_id); + pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id); switch (xec) { case 0x0 ... 0xe: @@ -527,9 +527,9 @@ void amd_decode_nb_mce(struct mce *m) if (xec == 0x0 || xec == 0x8) { /* no ECCs on F11h */ if (c->x86 == 0x11) - goto wrong_nb_mce; + goto wrong_mc4_mce; - pr_cont("%s.\n", nb_mce_desc[xec]); + pr_cont("%s.\n", mc4_mce_desc[xec]); if (nb_bus_decoder) nb_bus_decoder(node_id, m); @@ -543,14 +543,14 @@ void amd_decode_nb_mce(struct mce *m) else if (BUS_ERROR(ec)) pr_cont("DMA Exclusion Vector Table Walk error.\n"); else - goto wrong_nb_mce; + goto wrong_mc4_mce; return; case 0x19: if (boot_cpu_data.x86 == 0x15) pr_cont("Compute Unit Data Error.\n"); else - goto wrong_nb_mce; + goto wrong_mc4_mce; return; case 0x1c ... 0x1f: @@ -558,46 +558,44 @@ void amd_decode_nb_mce(struct mce *m) break; default: - goto wrong_nb_mce; + goto wrong_mc4_mce; } - pr_cont("%s.\n", nb_mce_desc[xec - offset]); + pr_cont("%s.\n", mc4_mce_desc[xec - offset]); return; -wrong_nb_mce: - pr_emerg(HW_ERR "Corrupted NB MCE info?\n"); + wrong_mc4_mce: + pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n"); } -EXPORT_SYMBOL_GPL(amd_decode_nb_mce); -static void amd_decode_fr_mce(struct mce *m) +static void decode_mc5_mce(struct mce *m) { struct cpuinfo_x86 *c = &boot_cpu_data; u8 xec = XEC(m->status, xec_mask); if (c->x86 == 0xf || c->x86 == 0x11) - goto wrong_fr_mce; + goto wrong_mc5_mce; - pr_emerg(HW_ERR "%s Error: ", - (c->x86 == 0x15 ? "Execution Unit" : "FIROB")); + pr_emerg(HW_ERR "MC5 Error: "); if (xec == 0x0 || xec == 0xc) - pr_cont("%s.\n", fr_ex_mce_desc[xec]); + pr_cont("%s.\n", mc5_mce_desc[xec]); else if (xec < 0xd) - pr_cont("%s parity error.\n", fr_ex_mce_desc[xec]); + pr_cont("%s parity error.\n", mc5_mce_desc[xec]); else - goto wrong_fr_mce; + goto wrong_mc5_mce; return; -wrong_fr_mce: - pr_emerg(HW_ERR "Corrupted FR MCE info?\n"); + wrong_mc5_mce: + pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n"); } -static void amd_decode_fp_mce(struct mce *m) +static void decode_mc6_mce(struct mce *m) { u8 xec = XEC(m->status, xec_mask); - pr_emerg(HW_ERR "Floating Point Unit Error: "); + pr_emerg(HW_ERR "MC6 Error: "); switch (xec) { case 0x1: @@ -621,7 +619,7 @@ static void amd_decode_fp_mce(struct mce *m) break; default: - goto wrong_fp_mce; + goto wrong_mc6_mce; break; } @@ -629,8 +627,8 @@ static void amd_decode_fp_mce(struct mce *m) return; -wrong_fp_mce: - pr_emerg(HW_ERR "Corrupted FP MCE info?\n"); + wrong_mc6_mce: + pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n"); } static inline void amd_decode_err_code(u16 ec) @@ -703,34 +701,34 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) switch (m->bank) { case 0: - amd_decode_dc_mce(m); + decode_mc0_mce(m); break; case 1: - amd_decode_ic_mce(m); + decode_mc1_mce(m); break; case 2: if (c->x86 == 0x15) - amd_decode_cu_mce(m); + decode_f15_mc2_mce(m); else - amd_decode_bu_mce(m); + decode_mc2_mce(m); break; case 3: - amd_decode_ls_mce(m); + decode_mc3_mce(m); break; case 4: - amd_decode_nb_mce(m); + decode_mc4_mce(m); break; case 5: - amd_decode_fr_mce(m); + decode_mc5_mce(m); break; case 6: - amd_decode_fp_mce(m); + decode_mc6_mce(m); break; default: @@ -763,35 +761,35 @@ static int __init mce_amd_init(void) switch (c->x86) { case 0xf: - fam_ops->dc_mce = k8_dc_mce; - fam_ops->ic_mce = k8_ic_mce; + fam_ops->mc0_mce = k8_mc0_mce; + fam_ops->mc1_mce = k8_mc1_mce; break; case 0x10: - fam_ops->dc_mce = f10h_dc_mce; - fam_ops->ic_mce = k8_ic_mce; + fam_ops->mc0_mce = f10h_mc0_mce; + fam_ops->mc1_mce = k8_mc1_mce; break; case 0x11: - fam_ops->dc_mce = k8_dc_mce; - fam_ops->ic_mce = k8_ic_mce; + fam_ops->mc0_mce = k8_mc0_mce; + fam_ops->mc1_mce = k8_mc1_mce; break; case 0x12: - fam_ops->dc_mce = f12h_dc_mce; - fam_ops->ic_mce = k8_ic_mce; + fam_ops->mc0_mce = f12h_mc0_mce; + fam_ops->mc1_mce = k8_mc1_mce; break; case 0x14: nb_err_cpumask = 0x3; - fam_ops->dc_mce = f14h_dc_mce; - fam_ops->ic_mce = f14h_ic_mce; + fam_ops->mc0_mce = f14h_mc0_mce; + fam_ops->mc1_mce = f14h_mc1_mce; break; case 0x15: xec_mask = 0x1f; - fam_ops->dc_mce = f15h_dc_mce; - fam_ops->ic_mce = f15h_ic_mce; + fam_ops->mc0_mce = f15h_mc0_mce; + fam_ops->mc1_mce = f15h_mc1_mce; break; default: diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index 8c87a5e87057..942f382ecb64 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h @@ -78,14 +78,13 @@ extern const char * const ii_msgs[]; * per-family decoder ops */ struct amd_decoder_ops { - bool (*dc_mce)(u16, u8); - bool (*ic_mce)(u16, u8); + bool (*mc0_mce)(u16, u8); + bool (*mc1_mce)(u16, u8); }; void amd_report_gart_errors(bool); void amd_register_ecc_decoder(void (*f)(int, struct mce *)); void amd_unregister_ecc_decoder(void (*f)(int, struct mce *)); -void amd_decode_nb_mce(struct mce *); int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data); #endif /* _EDAC_MCE_AMD_H */ From f89f8388cd11faa8e77992cb11ab44ac9a6abf4f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 13 Sep 2012 15:14:22 +0200 Subject: [PATCH 20/24] MCE, AMD: Dump CPU f/m/s triple with the error It is very useful to have the family/model/stepping with the reported error so dump it. This saves us asking the bug reporter about it. Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 6b38c1186922..e871a2abc802 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -670,14 +670,16 @@ static bool amd_filter_mce(struct mce *m) int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) { struct mce *m = (struct mce *)data; - struct cpuinfo_x86 *c = &boot_cpu_data; + struct cpuinfo_x86 *c = &cpu_data(m->extcpu); int ecc; if (amd_filter_mce(m)) return NOTIFY_STOP; - pr_emerg(HW_ERR "CPU:%d\tMC%d_STATUS[%s|%s|%s|%s|%s", - m->extcpu, m->bank, + pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s", + m->extcpu, + c->x86, c->x86_model, c->x86_mask, + m->bank, ((m->status & MCI_STATUS_OVER) ? "Over" : "-"), ((m->status & MCI_STATUS_UC) ? "UE" : "CE"), ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"), @@ -697,7 +699,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) pr_cont("]: 0x%016llx\n", m->status); if (m->status & MCI_STATUS_ADDRV) - pr_emerg(HW_ERR "\tMC%d_ADDR: 0x%016llx\n", m->bank, m->addr); + pr_emerg(HW_ERR "MC%d_ADDR: 0x%016llx\n", m->bank, m->addr); switch (m->bank) { case 0: From d824c7718b78b6a5afae7fc78731b70318cd076f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 14 Sep 2012 20:10:59 +0200 Subject: [PATCH 21/24] MCE, AMD: Report decoded error type first Instead of starting with the error details, report the decoded, readable error type first. Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 50 +++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index e871a2abc802..f856a2531cec 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -676,31 +676,6 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (amd_filter_mce(m)) return NOTIFY_STOP; - pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s", - m->extcpu, - c->x86, c->x86_model, c->x86_mask, - m->bank, - ((m->status & MCI_STATUS_OVER) ? "Over" : "-"), - ((m->status & MCI_STATUS_UC) ? "UE" : "CE"), - ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"), - ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"), - ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-")); - - if (c->x86 == 0x15) - pr_cont("|%s|%s", - ((m->status & BIT_64(44)) ? "Deferred" : "-"), - ((m->status & BIT_64(43)) ? "Poison" : "-")); - - /* do the two bits[14:13] together */ - ecc = (m->status >> 45) & 0x3; - if (ecc) - pr_cont("|%sECC", ((ecc == 2) ? "C" : "U")); - - pr_cont("]: 0x%016llx\n", m->status); - - if (m->status & MCI_STATUS_ADDRV) - pr_emerg(HW_ERR "MC%d_ADDR: 0x%016llx\n", m->bank, m->addr); - switch (m->bank) { case 0: decode_mc0_mce(m); @@ -737,6 +712,31 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) break; } + pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s", + m->extcpu, + c->x86, c->x86_model, c->x86_mask, + m->bank, + ((m->status & MCI_STATUS_OVER) ? "Over" : "-"), + ((m->status & MCI_STATUS_UC) ? "UE" : "CE"), + ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"), + ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"), + ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-")); + + if (c->x86 == 0x15) + pr_cont("|%s|%s", + ((m->status & BIT_64(44)) ? "Deferred" : "-"), + ((m->status & BIT_64(43)) ? "Poison" : "-")); + + /* do the two bits[14:13] together */ + ecc = (m->status >> 45) & 0x3; + if (ecc) + pr_cont("|%sECC", ((ecc == 2) ? "C" : "U")); + + pr_cont("]: 0x%016llx\n", m->status); + + if (m->status & MCI_STATUS_ADDRV) + pr_emerg(HW_ERR "MC%d_ADDR: 0x%016llx\n", m->bank, m->addr); + amd_decode_err_code(m->status & 0xffff); return NOTIFY_STOP; From d5c6770d4cb27bc33aa433cf8fb848ad9af6644b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 14 Sep 2012 20:25:37 +0200 Subject: [PATCH 22/24] MCE, AMD: Dump error status Dump error status after decoding the error which describes the error disposition. Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 22 ++++++++++++++++++++-- drivers/edac/mce_amd.h | 6 ++---- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index f856a2531cec..ad637572d8c7 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -667,6 +667,22 @@ static bool amd_filter_mce(struct mce *m) return false; } +static const char *decode_error_status(struct mce *m) +{ + if (m->status & MCI_STATUS_UC) { + if (m->status & MCI_STATUS_PCC) + return "System Fatal error."; + if (m->mcgstatus & MCG_STATUS_RIPV) + return "Uncorrected, software restartable error."; + return "Uncorrected, software containable error."; + } + + if (m->status & MCI_STATUS_DEFERRED) + return "Deferred error."; + + return "Corrected error, no action required."; +} + int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) { struct mce *m = (struct mce *)data; @@ -712,6 +728,8 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) break; } + pr_emerg(HW_ERR "Error Status: %s\n", decode_error_status(m)); + pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s", m->extcpu, c->x86, c->x86_model, c->x86_mask, @@ -724,8 +742,8 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (c->x86 == 0x15) pr_cont("|%s|%s", - ((m->status & BIT_64(44)) ? "Deferred" : "-"), - ((m->status & BIT_64(43)) ? "Poison" : "-")); + ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"), + ((m->status & MCI_STATUS_POISON) ? "Poison" : "-")); /* do the two bits[14:13] together */ ecc = (m->status >> 45) & 0x3; diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index 942f382ecb64..679679951e23 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h @@ -29,10 +29,8 @@ #define R4(x) (((x) >> 4) & 0xf) #define R4_MSG(x) ((R4(x) < 9) ? rrrr_msgs[R4(x)] : "Wrong R4!") -/* - * F3x4C bits (MCi_STATUS' high half) - */ -#define NBSH_ERR_CPU_VAL BIT(24) +#define MCI_STATUS_DEFERRED BIT_64(44) +#define MCI_STATUS_POISON BIT_64(43) enum tt_ids { TT_INSTR = 0, From 2d56b109e3a50cf316e60f07541bbeb1d8fe251a Mon Sep 17 00:00:00 2001 From: Denis Kirjanov Date: Thu, 25 Oct 2012 19:42:58 +0400 Subject: [PATCH 23/24] EDAC: Handle error path in edac_mc_sysfs_init() properly Make sure proper deregistration happens on all error paths in edac_mc_sysfs_init. Signed-off-by: Denis Kirjanov [ Boris: cleanup and concretize commit message ] Signed-off-by: Borislav Petkov --- drivers/edac/edac_mc_sysfs.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 52813b8b0f5f..de2df92f9c77 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -1126,10 +1126,15 @@ int __init edac_mc_sysfs_init(void) edac_subsys = edac_get_sysfs_subsys(); if (edac_subsys == NULL) { edac_dbg(1, "no edac_subsys\n"); - return -EINVAL; + err = -EINVAL; + goto out; } mci_pdev = kzalloc(sizeof(*mci_pdev), GFP_KERNEL); + if (!mci_pdev) { + err = -ENOMEM; + goto out_put_sysfs; + } mci_pdev->bus = edac_subsys; mci_pdev->type = &mc_attr_type; @@ -1138,11 +1143,18 @@ int __init edac_mc_sysfs_init(void) err = device_add(mci_pdev); if (err < 0) - return err; + goto out_dev_free; edac_dbg(0, "device %s created\n", dev_name(mci_pdev)); return 0; + + out_dev_free: + kfree(mci_pdev); + out_put_sysfs: + edac_put_sysfs_subsys(); + out: + return err; } void __exit edac_mc_sysfs_exit(void) @@ -1150,4 +1162,5 @@ void __exit edac_mc_sysfs_exit(void) put_device(mci_pdev); device_del(mci_pdev); edac_put_sysfs_subsys(); + kfree(mci_pdev); } From 3bfe5aae8edd8436d26cddfeab783492d8950821 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 4 Dec 2012 00:01:42 -0500 Subject: [PATCH 24/24] EDAC, pci_sysfs: Use for_each_pci_dev to simplify the code Use for_each_pci_dev to simplify the code. Signed-off-by: Wei Yongjun [Boris: cleanup comments and drop loop brackets] Signed-off-by: Borislav Petkov --- drivers/edac/edac_pci_sysfs.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/edac/edac_pci_sysfs.c b/drivers/edac/edac_pci_sysfs.c index e164c555a337..dc6e905ee1a5 100644 --- a/drivers/edac/edac_pci_sysfs.c +++ b/drivers/edac/edac_pci_sysfs.c @@ -645,20 +645,16 @@ typedef void (*pci_parity_check_fn_t) (struct pci_dev *dev); /* * pci_dev parity list iterator - * Scan the PCI device list for one pass, looking for SERRORs - * Master Parity ERRORS or Parity ERRORs on primary or secondary devices + * + * Scan the PCI device list looking for SERRORs, Master Parity ERRORS or + * Parity ERRORs on primary or secondary devices. */ static inline void edac_pci_dev_parity_iterator(pci_parity_check_fn_t fn) { struct pci_dev *dev = NULL; - /* request for kernel access to the next PCI device, if any, - * and while we are looking at it have its reference count - * bumped until we are done with it - */ - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + for_each_pci_dev(dev) fn(dev); - } } /*