summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--drivers/edac/i7core_edac.c208
1 files changed, 178 insertions, 30 deletions
diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c
index 87d5695f5fb..4758c208f39 100644
--- a/drivers/edac/i7core_edac.c
+++ b/drivers/edac/i7core_edac.c
@@ -73,6 +73,18 @@
#define DIMM1_COR_ERR(r) (((r) >> 16) & 0x7fff)
#define DIMM0_COR_ERR(r) ((r) & 0x7fff)
+/* OFFSETS for Device 3 Function 2, as inicated on Xeon 5500 datasheet */
+#define MC_COR_ECC_CNT_0 0x80
+#define MC_COR_ECC_CNT_1 0x84
+#define MC_COR_ECC_CNT_2 0x88
+#define MC_COR_ECC_CNT_3 0x8c
+#define MC_COR_ECC_CNT_4 0x90
+#define MC_COR_ECC_CNT_5 0x94
+
+#define DIMM_TOP_COR_ERR(r) (((r) >> 16) & 0x7fff)
+#define DIMM_BOT_COR_ERR(r) ((r) & 0x7fff)
+
+
/* OFFSETS for Devices 4,5 and 6 Function 0 */
#define MC_CHANNEL_DIMM_INIT_PARAMS 0x58
@@ -194,13 +206,20 @@ struct i7core_pvt {
struct i7core_inject inject;
struct i7core_channel channel[NUM_SOCKETS][NUM_CHANS];
+ unsigned int is_registered:1; /* true if all memories are RDIMMs */
+
int sockets; /* Number of sockets */
int channels; /* Number of active channels */
int ce_count_available[NUM_SOCKETS];
- /* ECC corrected errors counts per dimm */
- unsigned long ce_count[NUM_SOCKETS][MAX_DIMMS];
- int last_ce_count[NUM_SOCKETS][MAX_DIMMS];
+ int csrow_map[NUM_SOCKETS][NUM_CHANS][MAX_DIMMS];
+
+ /* ECC corrected errors counts per udimm */
+ unsigned long udimm_ce_count[NUM_SOCKETS][MAX_DIMMS];
+ int udimm_last_ce_count[NUM_SOCKETS][MAX_DIMMS];
+ /* ECC corrected errors counts per rdimm */
+ unsigned long rdimm_ce_count[NUM_SOCKETS][NUM_CHANS][MAX_DIMMS];
+ int rdimm_last_ce_count[NUM_SOCKETS][NUM_CHANS][MAX_DIMMS];
/* mcelog glue */
struct edac_mce edac_mce;
@@ -471,6 +490,8 @@ static int get_dimm_config(struct mem_ctl_info *mci, int *csrow, u8 socket)
numrow(pvt->info.max_dod >> 6),
numcol(pvt->info.max_dod >> 9));
+ pvt->is_registered = 1;
+
for (i = 0; i < NUM_CHANS; i++) {
u32 data, dimm_dod[3], value[8];
@@ -492,8 +513,14 @@ static int get_dimm_config(struct mem_ctl_info *mci, int *csrow, u8 socket)
if (data & REGISTERED_DIMM)
mtype = MEM_RDDR3;
- else
+ else {
mtype = MEM_DDR3;
+ /*
+ * FIXME: Currently, the driver will use dev 3:2
+ * counter registers only if all memories are registered
+ */
+ pvt->is_registered = 0;
+ }
#if 0
if (data & THREE_DIMMS_PRESENT)
pvt->channel[i].dimms = 3;
@@ -562,6 +589,8 @@ static int get_dimm_config(struct mem_ctl_info *mci, int *csrow, u8 socket)
csr->channels[0].chan_idx = i;
csr->channels[0].ce_count = 0;
+ pvt->csrow_map[socket][i][j] = *csrow;
+
switch (banks) {
case 4:
csr->dtype = DEV_X4;
@@ -1031,19 +1060,31 @@ static ssize_t i7core_inject_enable_show(struct mem_ctl_info *mci,
static ssize_t i7core_ce_regs_show(struct mem_ctl_info *mci, char *data)
{
- unsigned i, count, total = 0;
+ unsigned i, j, count, total = 0;
struct i7core_pvt *pvt = mci->pvt_info;
for (i = 0; i < pvt->sockets; i++) {
- if (!pvt->ce_count_available[i])
+ if (!pvt->ce_count_available[i]) {
count = sprintf(data, "socket 0 data unavailable\n");
- else
+ continue;
+ }
+ if (!pvt->is_registered)
count = sprintf(data, "socket %d, dimm0: %lu\n"
"dimm1: %lu\ndimm2: %lu\n",
i,
- pvt->ce_count[i][0],
- pvt->ce_count[i][1],
- pvt->ce_count[i][2]);
+ pvt->udimm_ce_count[i][0],
+ pvt->udimm_ce_count[i][1],
+ pvt->udimm_ce_count[i][2]);
+ else
+ for (j = 0; j < NUM_CHANS; j++) {
+ count = sprintf(data, "socket %d, channel %d"
+ "dimm0: %lu\n"
+ "dimm1: %lu\ndimm2: %lu\n",
+ i, j,
+ pvt->rdimm_ce_count[i][j][0],
+ pvt->rdimm_ce_count[i][j][1],
+ pvt->rdimm_ce_count[i][j][2]);
+ }
data += count;
total += count;
}
@@ -1308,6 +1349,103 @@ error:
/****************************************************************************
Error check routines
****************************************************************************/
+static void i7core_rdimm_update_csrow(struct mem_ctl_info *mci, int socket,
+ int chan, int dimm, int add)
+{
+ char *msg;
+ struct i7core_pvt *pvt = mci->pvt_info;
+ int row = pvt->csrow_map[socket][chan][dimm], i;
+
+ for (i = 0; i < add; i++) {
+ msg = kasprintf(GFP_KERNEL, "Corrected error "
+ "(Socket=%d channel=%d dimm=%d",
+ socket, chan, dimm);
+
+ edac_mc_handle_fbd_ce(mci, row, 0, msg);
+ kfree (msg);
+ }
+}
+
+static void i7core_rdimm_update_ce_count(struct mem_ctl_info *mci,
+ int socket, int chan, int new0, int new1, int new2)
+{
+ struct i7core_pvt *pvt = mci->pvt_info;
+ int add0 = 0, add1 = 0, add2 = 0;
+ /* Updates CE counters if it is not the first time here */
+ if (pvt->ce_count_available[socket]) {
+ /* Updates CE counters */
+
+ add2 = new2 - pvt->rdimm_last_ce_count[socket][chan][2];
+ add1 = new1 - pvt->rdimm_last_ce_count[socket][chan][1];
+ add0 = new0 - pvt->rdimm_last_ce_count[socket][chan][0];
+
+ if (add2 < 0)
+ add2 += 0x7fff;
+ pvt->rdimm_ce_count[socket][chan][2] += add2;
+
+ if (add1 < 0)
+ add1 += 0x7fff;
+ pvt->rdimm_ce_count[socket][chan][1] += add1;
+
+ if (add0 < 0)
+ add0 += 0x7fff;
+ pvt->rdimm_ce_count[socket][chan][0] += add0;
+ } else
+ pvt->ce_count_available[socket] = 1;
+
+ /* Store the new values */
+ pvt->rdimm_last_ce_count[socket][chan][2] = new2;
+ pvt->rdimm_last_ce_count[socket][chan][1] = new1;
+ pvt->rdimm_last_ce_count[socket][chan][0] = new0;
+
+ /*updated the edac core */
+ if (add0 != 0)
+ i7core_rdimm_update_csrow(mci, socket, chan, 0, add0);
+ if (add1 != 0)
+ i7core_rdimm_update_csrow(mci, socket, chan, 1, add1);
+ if (add2 != 0)
+ i7core_rdimm_update_csrow(mci, socket, chan, 2, add2);
+
+}
+
+static void i7core_rdimm_check_mc_ecc_err(struct mem_ctl_info *mci, u8 socket)
+{
+ struct i7core_pvt *pvt = mci->pvt_info;
+ u32 rcv[3][2];
+ int i, new0, new1, new2;
+
+ /*Read DEV 3: FUN 2: MC_COR_ECC_CNT regs directly*/
+ pci_read_config_dword(pvt->pci_mcr[socket][2], MC_COR_ECC_CNT_0,
+ &rcv[0][0]);
+ pci_read_config_dword(pvt->pci_mcr[socket][2], MC_COR_ECC_CNT_1,
+ &rcv[0][1]);
+ pci_read_config_dword(pvt->pci_mcr[socket][2], MC_COR_ECC_CNT_2,
+ &rcv[1][0]);
+ pci_read_config_dword(pvt->pci_mcr[socket][2], MC_COR_ECC_CNT_3,
+ &rcv[1][1]);
+ pci_read_config_dword(pvt->pci_mcr[socket][2], MC_COR_ECC_CNT_4,
+ &rcv[2][0]);
+ pci_read_config_dword(pvt->pci_mcr[socket][2], MC_COR_ECC_CNT_5,
+ &rcv[2][1]);
+ for (i = 0 ; i < 3; i++) {
+ debugf3("MC_COR_ECC_CNT%d = 0x%x; MC_COR_ECC_CNT%d = 0x%x\n",
+ (i * 2), rcv[i][0], (i * 2) + 1, rcv[i][1]);
+ /*if the channel has 3 dimms*/
+ if (pvt->channel[socket][i].dimms > 2) {
+ new0 = DIMM_BOT_COR_ERR(rcv[i][0]);
+ new1 = DIMM_TOP_COR_ERR(rcv[i][0]);
+ new2 = DIMM_BOT_COR_ERR(rcv[i][1]);
+ } else {
+ new0 = DIMM_TOP_COR_ERR(rcv[i][0]) +
+ DIMM_BOT_COR_ERR(rcv[i][0]);
+ new1 = DIMM_TOP_COR_ERR(rcv[i][1]) +
+ DIMM_BOT_COR_ERR(rcv[i][1]);
+ new2 = 0;
+ }
+
+ i7core_rdimm_update_ce_count(mci, socket, i, new0, new1, new2);
+ }
+}
/* This function is based on the device 3 function 4 registers as described on:
* Intel Xeon Processor 5500 Series Datasheet Volume 2
@@ -1315,7 +1453,7 @@ error:
* also available at:
* http://www.arrownac.com/manufacturers/intel/s/nehalem/5500-datasheet-v2.pdf
*/
-static void check_mc_test_err(struct mem_ctl_info *mci, u8 socket)
+static void i7core_udimm_check_mc_ecc_err(struct mem_ctl_info *mci, u8 socket)
{
struct i7core_pvt *pvt = mci->pvt_info;
u32 rcv1, rcv0;
@@ -1326,7 +1464,7 @@ static void check_mc_test_err(struct mem_ctl_info *mci, u8 socket)
return;
}
- /* Corrected error reads */
+ /* Corrected test errors */
pci_read_config_dword(pvt->pci_mcr[socket][4], MC_TEST_ERR_RCV1, &rcv1);
pci_read_config_dword(pvt->pci_mcr[socket][4], MC_TEST_ERR_RCV0, &rcv0);
@@ -1335,39 +1473,38 @@ static void check_mc_test_err(struct mem_ctl_info *mci, u8 socket)
new1 = DIMM1_COR_ERR(rcv0);
new0 = DIMM0_COR_ERR(rcv0);
-#if 0
- debugf2("%s CE rcv1=0x%08x rcv0=0x%08x, %d %d %d\n",
- (pvt->ce_count_available ? "UPDATE" : "READ"),
- rcv1, rcv0, new0, new1, new2);
-#endif
-
/* Updates CE counters if it is not the first time here */
if (pvt->ce_count_available[socket]) {
/* Updates CE counters */
int add0, add1, add2;
- add2 = new2 - pvt->last_ce_count[socket][2];
- add1 = new1 - pvt->last_ce_count[socket][1];
- add0 = new0 - pvt->last_ce_count[socket][0];
+ add2 = new2 - pvt->udimm_last_ce_count[socket][2];
+ add1 = new1 - pvt->udimm_last_ce_count[socket][1];
+ add0 = new0 - pvt->udimm_last_ce_count[socket][0];
if (add2 < 0)
add2 += 0x7fff;
- pvt->ce_count[socket][2] += add2;
+ pvt->udimm_ce_count[socket][2] += add2;
if (add1 < 0)
add1 += 0x7fff;
- pvt->ce_count[socket][1] += add1;
+ pvt->udimm_ce_count[socket][1] += add1;
if (add0 < 0)
add0 += 0x7fff;
- pvt->ce_count[socket][0] += add0;
+ pvt->udimm_ce_count[socket][0] += add0;
+
+ if (add0 | add1 | add2)
+ i7core_printk(KERN_ERR, "New Corrected error(s): "
+ "dimm0: +%d, dimm1: +%d, dimm2 +%d\n",
+ add0, add1, add2);
} else
pvt->ce_count_available[socket] = 1;
/* Store the new values */
- pvt->last_ce_count[socket][2] = new2;
- pvt->last_ce_count[socket][1] = new1;
- pvt->last_ce_count[socket][0] = new0;
+ pvt->udimm_last_ce_count[socket][2] = new2;
+ pvt->udimm_last_ce_count[socket][1] = new1;
+ pvt->udimm_last_ce_count[socket][0] = new0;
}
/*
@@ -1386,6 +1523,7 @@ static void check_mc_test_err(struct mem_ctl_info *mci, u8 socket)
static void i7core_mce_output_error(struct mem_ctl_info *mci,
struct mce *m)
{
+ struct i7core_pvt *pvt = mci->pvt_info;
char *type, *optype, *err, *msg;
unsigned long error = m->status & 0x1ff0000l;
u32 optypenum = (m->status >> 4) & 0x07;
@@ -1394,6 +1532,7 @@ static void i7core_mce_output_error(struct mem_ctl_info *mci,
u32 channel = (m->misc >> 18) & 0x3;
u32 syndrome = m->misc >> 32;
u32 errnum = find_first_bit(&error, 32);
+ int csrow;
if (m->mcgstatus & 1)
type = "FATAL";
@@ -1463,9 +1602,15 @@ static void i7core_mce_output_error(struct mem_ctl_info *mci,
debugf0("%s", msg);
+ csrow = pvt->csrow_map[m->cpu][channel][dimm];
+
/* Call the helper to output message */
- edac_mc_handle_fbd_ue(mci, 0 /* FIXME: should be rank here */,
- 0, 0 /* FIXME: should be channel here */, msg);
+ if (m->mcgstatus & 1)
+ edac_mc_handle_fbd_ue(mci, csrow, 0,
+ 0 /* FIXME: should be channel here */, msg);
+ else if (!pvt->is_registered)
+ edac_mc_handle_fbd_ce(mci, csrow,
+ 0 /* FIXME: should be channel here */, msg);
kfree(msg);
}
@@ -1502,7 +1647,10 @@ static void i7core_check_error(struct mem_ctl_info *mci)
/* check memory count errors */
for (i = 0; i < pvt->sockets; i++)
- check_mc_test_err(mci, i);
+ if (!pvt->is_registered)
+ i7core_udimm_check_mc_ecc_err(mci, i);
+ else
+ i7core_rdimm_check_mc_ecc_err(mci, i);
}
/*