sb_edac: Fix support for systems with two home agents per socket

First noticed a problem on a 4 socket machine where EDAC only reported
half the DIMMS.  Tracked this down to the code that assumes that systems
with two home agents only have two memory channels on each agent. This
is true on 2 sockect ("-EP") machines. But four socket ("-EX") machines
have four memory channels on each home agent.

The old code would have had problems on two socket systems as it did
a shuffling trick to make the internals of the code think that the
channels from the first agent were '0' and '1', with the second agent
providing '2' and '3'. But the code didn't uniformly convert from
{ha,channel} tuples to this internal representation.

New code always considers up to eight channels.
On a machine with a single home agent these map easily to edac channels
0, 1, 2, 3. On machines with two home agents we map using:
  edac_channel = 4*ha# + channel
So on a -EP machine where each home agent supports only two channels
we'll fill in channels 0, 1, 4, 5, and on a -EX machine we use all of 0,
1, 2, 3, 4, 5, 6, 7.

[mchehab@osg.samsung.com: fold a fixup patch as per Tony's request and fixed
 a few CodingStyle issues]
Signed-off-by: Tony Luck <tony.luck@intel.com>
Acked-by: Aristeu Rozanski <aris@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
This commit is contained in:
Tony Luck 2015-05-18 17:50:42 -03:00 committed by Mauro Carvalho Chehab
parent bb89e7141a
commit 7d375bffa5

View File

@ -34,7 +34,7 @@ static int probed;
/*
* Alter this version for the module when modifications are made
*/
#define SBRIDGE_REVISION " Ver: 1.1.0 "
#define SBRIDGE_REVISION " Ver: 1.1.1 "
#define EDAC_MOD_STR "sbridge_edac"
/*
@ -254,7 +254,7 @@ static const u32 correrrthrsld[] = {
* sbridge structs
*/
#define NUM_CHANNELS 4
#define NUM_CHANNELS 8 /* 2MC per socket, four chan per MC */
#define MAX_DIMMS 3 /* Max DIMMS per channel */
#define CHANNEL_UNSPECIFIED 0xf /* Intel IA32 SDM 15-14 */
@ -393,6 +393,8 @@ static const struct pci_id_table pci_dev_descr_sbridge_table[] = {
#define PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_RAS 0x0e79
#define PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0 0x0e6a
#define PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD1 0x0e6b
#define PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD2 0x0e6c
#define PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD3 0x0e6d
static const struct pci_id_descr pci_dev_descr_ibridge[] = {
/* Processor Home Agent */
@ -421,6 +423,8 @@ static const struct pci_id_descr pci_dev_descr_ibridge[] = {
#endif
{ PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0, 1) },
{ PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD1, 1) },
{ PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD2, 1) },
{ PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD3, 1) },
{ PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_1HA_DDRIO0, 1) },
{ PCI_DESCR(PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_2HA_DDRIO0, 1) },
@ -909,6 +913,8 @@ static int get_dimm_config(struct mem_ctl_info *mci)
for (i = 0; i < NUM_CHANNELS; i++) {
u32 mtr;
if (!pvt->pci_tad[i])
continue;
for (j = 0; j < ARRAY_SIZE(mtr_regs); j++) {
dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers,
i, j, 0);
@ -925,8 +931,8 @@ static int get_dimm_config(struct mem_ctl_info *mci)
size = ((u64)rows * cols * banks * ranks) >> (20 - 3);
npages = MiB_TO_PAGES(size);
edac_dbg(0, "mc#%d: channel %d, dimm %d, %Ld Mb (%d pages) bank: %d, rank: %d, row: %#x, col: %#x\n",
pvt->sbridge_dev->mc, i, j,
edac_dbg(0, "mc#%d: ha %d channel %d, dimm %d, %lld Mb (%d pages) bank: %d, rank: %d, row: %#x, col: %#x\n",
pvt->sbridge_dev->mc, i/4, i%4, j,
size, npages,
banks, ranks, rows, cols);
@ -946,8 +952,8 @@ static int get_dimm_config(struct mem_ctl_info *mci)
dimm->mtype = mtype;
dimm->edac_mode = mode;
snprintf(dimm->label, sizeof(dimm->label),
"CPU_SrcID#%u_Channel#%u_DIMM#%u",
pvt->sbridge_dev->source_id, i, j);
"CPU_SrcID#%u_Ha#%u_Chan#%u_DIMM#%u",
pvt->sbridge_dev->source_id, i/4, i%4, j);
}
}
}
@ -1128,7 +1134,7 @@ static struct mem_ctl_info *get_mci_for_node_id(u8 node_id)
static int get_memory_error_data(struct mem_ctl_info *mci,
u64 addr,
u8 *socket,
u8 *socket, u8 *ha,
long *channel_mask,
u8 *rank,
char **area_type, char *msg)
@ -1141,7 +1147,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
int interleave_mode, shiftup = 0;
unsigned sad_interleave[pvt->info.max_interleave];
u32 reg, dram_rule;
u8 ch_way, sck_way, pkg, sad_ha = 0;
u8 ch_way, sck_way, pkg, sad_ha = 0, ch_add = 0;
u32 tad_offset;
u32 rir_way;
u32 mb, gb;
@ -1254,6 +1260,8 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
pkg = sad_pkg(pvt->info.interleave_pkg, reg, idx);
*socket = sad_pkg_socket(pkg);
sad_ha = sad_pkg_ha(pkg);
if (sad_ha)
ch_add = 4;
if (a7mode) {
/* MCChanShiftUpEnable */
@ -1270,10 +1278,14 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
pkg = sad_pkg(pvt->info.interleave_pkg, reg, idx);
*socket = sad_pkg_socket(pkg);
sad_ha = sad_pkg_ha(pkg);
if (sad_ha)
ch_add = 4;
edac_dbg(0, "SAD interleave package: %d = CPU socket %d, HA %d\n",
idx, *socket, sad_ha);
}
*ha = sad_ha;
/*
* Move to the proper node structure, in order to access the
* right PCI registers
@ -1346,7 +1358,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
}
*channel_mask = 1 << base_ch;
pci_read_config_dword(pvt->pci_tad[base_ch],
pci_read_config_dword(pvt->pci_tad[ch_add + base_ch],
tad_ch_nilv_offset[n_tads],
&tad_offset);
@ -1405,7 +1417,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
* Step 3) Decode rank
*/
for (n_rir = 0; n_rir < MAX_RIR_RANGES; n_rir++) {
pci_read_config_dword(pvt->pci_tad[base_ch],
pci_read_config_dword(pvt->pci_tad[ch_add + base_ch],
rir_way_limit[n_rir],
&reg);
@ -1435,7 +1447,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
idx = (ch_addr >> 13); /* FIXME: Datasheet says to shift by 15 */
idx %= 1 << rir_way;
pci_read_config_dword(pvt->pci_tad[base_ch],
pci_read_config_dword(pvt->pci_tad[ch_add + base_ch],
rir_offset[n_rir][idx],
&reg);
*rank = RIR_RNK_TGT(reg);
@ -1681,16 +1693,9 @@ static int ibridge_mci_bind_devs(struct mem_ctl_info *mci,
struct sbridge_dev *sbridge_dev)
{
struct sbridge_pvt *pvt = mci->pvt_info;
struct pci_dev *pdev, *tmp;
struct pci_dev *pdev;
u8 saw_chan_mask = 0;
int i;
bool mode_2ha = false;
tmp = pci_get_device(PCI_VENDOR_ID_INTEL,
PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1, NULL);
if (tmp) {
mode_2ha = true;
pci_dev_put(tmp);
}
for (i = 0; i < sbridge_dev->n_devs; i++) {
pdev = sbridge_dev->pdev[i];
@ -1706,26 +1711,21 @@ static int ibridge_mci_bind_devs(struct mem_ctl_info *mci,
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_RAS:
pvt->pci_ras = pdev;
break;
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD2:
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD3:
/* if we have 2 HAs active, channels 2 and 3
* are in other device */
if (mode_2ha)
break;
/* fall through */
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD0:
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD1:
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD2:
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD3:
{
int id = pdev->device - PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TAD0;
pvt->pci_tad[id] = pdev;
saw_chan_mask |= 1 << id;
}
break;
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_2HA_DDRIO0:
pvt->pci_ddrio = pdev;
break;
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_1HA_DDRIO0:
if (!mode_2ha)
pvt->pci_ddrio = pdev;
pvt->pci_ddrio = pdev;
break;
case PCI_DEVICE_ID_INTEL_IBRIDGE_SAD:
pvt->pci_sad0 = pdev;
@ -1741,13 +1741,12 @@ static int ibridge_mci_bind_devs(struct mem_ctl_info *mci,
break;
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0:
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD1:
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD2:
case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD3:
{
int id = pdev->device - PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0 + 2;
/* we shouldn't have this device if we have just one
* HA present */
WARN_ON(!mode_2ha);
int id = pdev->device - PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA1_TAD0 + 4;
pvt->pci_tad[id] = pdev;
saw_chan_mask |= 1 << id;
}
break;
default:
@ -1766,10 +1765,10 @@ static int ibridge_mci_bind_devs(struct mem_ctl_info *mci,
!pvt->pci_ta)
goto enodev;
for (i = 0; i < NUM_CHANNELS; i++) {
if (!pvt->pci_tad[i])
goto enodev;
}
if (saw_chan_mask != 0x0f && /* -EN */
saw_chan_mask != 0x33 && /* -EP */
saw_chan_mask != 0xff) /* -EX */
goto enodev;
return 0;
enodev:
@ -1787,16 +1786,9 @@ static int haswell_mci_bind_devs(struct mem_ctl_info *mci,
struct sbridge_dev *sbridge_dev)
{
struct sbridge_pvt *pvt = mci->pvt_info;
struct pci_dev *pdev, *tmp;
struct pci_dev *pdev;
u8 saw_chan_mask = 0;
int i;
bool mode_2ha = false;
tmp = pci_get_device(PCI_VENDOR_ID_INTEL,
PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1, NULL);
if (tmp) {
mode_2ha = true;
pci_dev_put(tmp);
}
/* there's only one device per system; not tied to any bus */
if (pvt->info.pci_vtd == NULL)
@ -1827,18 +1819,26 @@ static int haswell_mci_bind_devs(struct mem_ctl_info *mci,
pvt->pci_ras = pdev;
break;
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD0:
pvt->pci_tad[0] = pdev;
break;
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD1:
pvt->pci_tad[1] = pdev;
break;
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD2:
if (!mode_2ha)
pvt->pci_tad[2] = pdev;
break;
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD3:
if (!mode_2ha)
pvt->pci_tad[3] = pdev;
{
int id = pdev->device - PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0_TAD0;
pvt->pci_tad[id] = pdev;
saw_chan_mask |= 1 << id;
}
break;
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD0:
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD1:
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD2:
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD3:
{
int id = pdev->device - PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD0 + 4;
pvt->pci_tad[id] = pdev;
saw_chan_mask |= 1 << id;
}
break;
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_DDRIO0:
pvt->pci_ddrio = pdev;
@ -1849,14 +1849,6 @@ static int haswell_mci_bind_devs(struct mem_ctl_info *mci,
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TA:
pvt->pci_ha1_ta = pdev;
break;
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD0:
if (mode_2ha)
pvt->pci_tad[2] = pdev;
break;
case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA1_TAD1:
if (mode_2ha)
pvt->pci_tad[3] = pdev;
break;
default:
break;
}
@ -1872,10 +1864,10 @@ static int haswell_mci_bind_devs(struct mem_ctl_info *mci,
!pvt->pci_ras || !pvt->pci_ta || !pvt->info.pci_vtd)
goto enodev;
for (i = 0; i < NUM_CHANNELS; i++) {
if (!pvt->pci_tad[i])
goto enodev;
}
if (saw_chan_mask != 0x0f && /* -EN */
saw_chan_mask != 0x33 && /* -EP */
saw_chan_mask != 0xff) /* -EX */
goto enodev;
return 0;
enodev:
@ -1986,7 +1978,7 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
u32 channel = GET_BITFIELD(m->status, 0, 3);
u32 optypenum = GET_BITFIELD(m->status, 4, 6);
long channel_mask, first_channel;
u8 rank, socket;
u8 rank, socket, ha;
int rc, dimm;
char *area_type = NULL;
@ -2048,7 +2040,7 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
if (!GET_BITFIELD(m->status, 58, 58))
return;
rc = get_memory_error_data(mci, m->addr, &socket,
rc = get_memory_error_data(mci, m->addr, &socket, &ha,
&channel_mask, &rank, &area_type, msg);
if (rc < 0)
goto err_parsing;
@ -2080,12 +2072,12 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
channel = first_channel;
snprintf(msg, sizeof(msg),
"%s%s area:%s err_code:%04x:%04x socket:%d channel_mask:%ld rank:%d",
"%s%s area:%s err_code:%04x:%04x socket:%d ha:%d channel_mask:%ld rank:%d",
overflow ? " OVERFLOW" : "",
(uncorrected_error && recoverable) ? " recoverable" : "",
area_type,
mscod, errcode,
socket,
socket, ha,
channel_mask,
rank);
@ -2099,7 +2091,7 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
/* Call the helper to output message */
edac_mc_handle_error(tp_event, mci, core_err_cnt,
m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0,
channel, dimm, -1,
4*ha+channel, dimm, -1,
optype, msg);
return;
err_parsing: