diff options
Diffstat (limited to 'drivers/edac/sb_edac.c')
-rw-r--r-- | drivers/edac/sb_edac.c | 212 |
1 file changed, 78 insertions, 134 deletions
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 123204f8e23..4adaf4b7da9 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -314,8 +314,6 @@ struct sbridge_pvt { struct sbridge_info info; struct sbridge_channel channel[NUM_CHANNELS]; - int csrow_map[NUM_CHANNELS][MAX_DIMMS]; - /* Memory type detection */ bool is_mirrored, is_lockstep, is_close_pg; @@ -487,29 +485,14 @@ static struct pci_dev *get_pdev_slot_func(u8 bus, unsigned slot, } /** - * sbridge_get_active_channels() - gets the number of channels and csrows + * check_if_ecc_is_active() - Checks if ECC is active * bus: Device bus - * @channels: Number of channels that will be returned - * @csrows: Number of csrows found - * - * Since EDAC core needs to know in advance the number of available channels - * and csrows, in order to allocate memory for csrows/channels, it is needed - * to run two similar steps. At the first step, implemented on this function, - * it checks the number of csrows/channels present at one socket, identified - * by the associated PCI bus. - * this is used in order to properly allocate the size of mci components. - * Note: one csrow is one dimm. */ -static int sbridge_get_active_channels(const u8 bus, unsigned *channels, - unsigned *csrows) +static int check_if_ecc_is_active(const u8 bus) { struct pci_dev *pdev = NULL; - int i, j; u32 mcmtr; - *channels = 0; - *csrows = 0; - pdev = get_pdev_slot_func(bus, 15, 0); if (!pdev) { sbridge_printk(KERN_ERR, "Couldn't find PCI device " @@ -523,41 +506,14 @@ static int sbridge_get_active_channels(const u8 bus, unsigned *channels, sbridge_printk(KERN_ERR, "ECC is disabled. 
Aborting\n"); return -ENODEV; } - - for (i = 0; i < NUM_CHANNELS; i++) { - u32 mtr; - - /* Device 15 functions 2 - 5 */ - pdev = get_pdev_slot_func(bus, 15, 2 + i); - if (!pdev) { - sbridge_printk(KERN_ERR, "Couldn't find PCI device " - "%2x.%02d.%d!!!\n", - bus, 15, 2 + i); - return -ENODEV; - } - (*channels)++; - - for (j = 0; j < ARRAY_SIZE(mtr_regs); j++) { - pci_read_config_dword(pdev, mtr_regs[j], &mtr); - debugf1("Bus#%02x channel #%d MTR%d = %x\n", bus, i, j, mtr); - if (IS_DIMM_PRESENT(mtr)) - (*csrows)++; - } - } - - debugf0("Number of active channels: %d, number of active dimms: %d\n", - *channels, *csrows); - return 0; } -static int get_dimm_config(const struct mem_ctl_info *mci) +static int get_dimm_config(struct mem_ctl_info *mci) { struct sbridge_pvt *pvt = mci->pvt_info; - struct csrow_info *csr; + struct dimm_info *dimm; int i, j, banks, ranks, rows, cols, size, npages; - int csrow = 0; - unsigned long last_page = 0; u32 reg; enum edac_type mode; enum mem_type mtype; @@ -616,6 +572,8 @@ static int get_dimm_config(const struct mem_ctl_info *mci) u32 mtr; for (j = 0; j < ARRAY_SIZE(mtr_regs); j++) { + dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, + i, j, 0); pci_read_config_dword(pvt->pci_tad[i], mtr_regs[j], &mtr); debugf4("Channel #%d MTR%d = %x\n", i, j, mtr); @@ -634,29 +592,15 @@ static int get_dimm_config(const struct mem_ctl_info *mci) pvt->sbridge_dev->mc, i, j, size, npages, banks, ranks, rows, cols); - csr = &mci->csrows[csrow]; - - csr->first_page = last_page; - csr->last_page = last_page + npages - 1; - csr->page_mask = 0UL; /* Unused */ - csr->nr_pages = npages; - csr->grain = 32; - csr->csrow_idx = csrow; - csr->dtype = (banks == 8) ? 
DEV_X8 : DEV_X4; - csr->ce_count = 0; - csr->ue_count = 0; - csr->mtype = mtype; - csr->edac_mode = mode; - csr->nr_channels = 1; - csr->channels[0].chan_idx = i; - csr->channels[0].ce_count = 0; - pvt->csrow_map[i][j] = csrow; - snprintf(csr->channels[0].label, - sizeof(csr->channels[0].label), + + dimm->nr_pages = npages; + dimm->grain = 32; + dimm->dtype = (banks == 8) ? DEV_X8 : DEV_X4; + dimm->mtype = mtype; + dimm->edac_mode = mode; + snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_Channel#%u_DIMM#%u", pvt->sbridge_dev->source_id, i, j); - last_page += npages; - csrow++; } } } @@ -844,11 +788,10 @@ static int get_memory_error_data(struct mem_ctl_info *mci, u8 *socket, long *channel_mask, u8 *rank, - char *area_type) + char **area_type, char *msg) { struct mem_ctl_info *new_mci; struct sbridge_pvt *pvt = mci->pvt_info; - char msg[256]; int n_rir, n_sads, n_tads, sad_way, sck_xch; int sad_interl, idx, base_ch; int interleave_mode; @@ -870,12 +813,10 @@ static int get_memory_error_data(struct mem_ctl_info *mci, */ if ((addr > (u64) pvt->tolm) && (addr < (1LL << 32))) { sprintf(msg, "Error at TOLM area, on addr 0x%08Lx", addr); - edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; } if (addr >= (u64)pvt->tohm) { sprintf(msg, "Error at MMIOH area, on addr 0x%016Lx", addr); - edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; } @@ -892,7 +833,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci, limit = SAD_LIMIT(reg); if (limit <= prv) { sprintf(msg, "Can't discover the memory socket"); - edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; } if (addr <= limit) @@ -901,10 +841,9 @@ static int get_memory_error_data(struct mem_ctl_info *mci, } if (n_sads == MAX_SAD) { sprintf(msg, "Can't discover the memory socket"); - edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; } - area_type = get_dram_attr(reg); + *area_type = get_dram_attr(reg); interleave_mode = INTERLEAVE_MODE(reg); pci_read_config_dword(pvt->pci_sad0, interleave_list[n_sads], @@ 
-942,7 +881,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci, break; default: sprintf(msg, "Can't discover socket interleave"); - edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; } *socket = sad_interleave[idx]; @@ -957,7 +895,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci, if (!new_mci) { sprintf(msg, "Struct for socket #%u wasn't initialized", *socket); - edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; } mci = new_mci; @@ -973,7 +910,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci, limit = TAD_LIMIT(reg); if (limit <= prv) { sprintf(msg, "Can't discover the memory channel"); - edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; } if (addr <= limit) @@ -1013,7 +949,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci, break; default: sprintf(msg, "Can't discover the TAD target"); - edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; } *channel_mask = 1 << base_ch; @@ -1027,7 +962,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci, break; default: sprintf(msg, "Invalid mirror set. 
Can't decode addr"); - edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; } } else @@ -1055,7 +989,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci, if (offset > addr) { sprintf(msg, "Can't calculate ch addr: TAD offset 0x%08Lx is too high for addr 0x%08Lx!", offset, addr); - edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; } addr -= offset; @@ -1095,7 +1028,6 @@ static int get_memory_error_data(struct mem_ctl_info *mci, if (n_rir == MAX_RIR_RANGES) { sprintf(msg, "Can't discover the memory rank for ch addr 0x%08Lx", ch_addr); - edac_mc_handle_ce_no_info(mci, msg); return -EINVAL; } rir_way = RIR_WAY(reg); @@ -1409,7 +1341,8 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci, { struct mem_ctl_info *new_mci; struct sbridge_pvt *pvt = mci->pvt_info; - char *type, *optype, *msg, *recoverable_msg; + enum hw_event_mc_err_type tp_event; + char *type, *optype, msg[256]; bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0); bool overflow = GET_BITFIELD(m->status, 62, 62); bool uncorrected_error = GET_BITFIELD(m->status, 61, 61); @@ -1421,13 +1354,21 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci, u32 optypenum = GET_BITFIELD(m->status, 4, 6); long channel_mask, first_channel; u8 rank, socket; - int csrow, rc, dimm; - char *area_type = "Unknown"; - - if (ripv) - type = "NON_FATAL"; - else - type = "FATAL"; + int rc, dimm; + char *area_type = NULL; + + if (uncorrected_error) { + if (ripv) { + type = "FATAL"; + tp_event = HW_EVENT_ERR_FATAL; + } else { + type = "NON_FATAL"; + tp_event = HW_EVENT_ERR_UNCORRECTED; + } + } else { + type = "CORRECTED"; + tp_event = HW_EVENT_ERR_CORRECTED; + } /* * According with Table 15-9 of the Intel Architecture spec vol 3A, @@ -1445,19 +1386,19 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci, } else { switch (optypenum) { case 0: - optype = "generic undef request"; + optype = "generic undef request error"; break; case 1: - optype = "memory read"; + optype = "memory read error"; 
break; case 2: - optype = "memory write"; + optype = "memory write error"; break; case 3: - optype = "addr/cmd"; + optype = "addr/cmd error"; break; case 4: - optype = "memory scrubbing"; + optype = "memory scrubbing error"; break; default: optype = "reserved"; @@ -1466,13 +1407,13 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci, } rc = get_memory_error_data(mci, m->addr, &socket, - &channel_mask, &rank, area_type); + &channel_mask, &rank, &area_type, msg); if (rc < 0) - return; + goto err_parsing; new_mci = get_mci_for_node_id(socket); if (!new_mci) { - edac_mc_handle_ce_no_info(mci, "Error: socket got corrupted!"); - return; + strcpy(msg, "Error: socket got corrupted!"); + goto err_parsing; } mci = new_mci; pvt = mci->pvt_info; @@ -1486,45 +1427,39 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci, else dimm = 2; - csrow = pvt->csrow_map[first_channel][dimm]; - - if (uncorrected_error && recoverable) - recoverable_msg = " recoverable"; - else - recoverable_msg = ""; /* - * FIXME: What should we do with "channel" information on mcelog? - * Probably, we can just discard it, as the channel information - * comes from the get_memory_error_data() address decoding + * FIXME: On some memory configurations (mirror, lockstep), the + * Memory Controller can't point the error to a single DIMM. The + * EDAC core should be handling the channel mask, in order to point + * to the group of dimm's where the error may be happening. */ - msg = kasprintf(GFP_ATOMIC, - "%d %s error(s): %s on %s area %s%s: cpu=%d Err=%04x:%04x (ch=%d), " - "addr = 0x%08llx => socket=%d, Channel=%ld(mask=%ld), rank=%d\n", - core_err_cnt, - area_type, - optype, - type, - recoverable_msg, - overflow ? 
"OVERFLOW" : "", - m->cpu, - mscod, errcode, - channel, /* 1111b means not specified */ - (long long) m->addr, - socket, - first_channel, /* This is the real channel on SB */ - channel_mask, - rank); + snprintf(msg, sizeof(msg), + "count:%d%s%s area:%s err_code:%04x:%04x socket:%d channel_mask:%ld rank:%d", + core_err_cnt, + overflow ? " OVERFLOW" : "", + (uncorrected_error && recoverable) ? " recoverable" : "", + area_type, + mscod, errcode, + socket, + channel_mask, + rank); debugf0("%s", msg); + /* FIXME: need support for channel mask */ + /* Call the helper to output message */ - if (uncorrected_error) - edac_mc_handle_fbd_ue(mci, csrow, 0, 0, msg); - else - edac_mc_handle_fbd_ce(mci, csrow, 0, msg); + edac_mc_handle_error(tp_event, mci, + m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0, + channel, dimm, -1, + optype, msg, m); + return; +err_parsing: + edac_mc_handle_error(tp_event, mci, 0, 0, 0, + -1, -1, -1, + msg, "", m); - kfree(msg); } /* @@ -1683,16 +1618,25 @@ static void sbridge_unregister_mci(struct sbridge_dev *sbridge_dev) static int sbridge_register_mci(struct sbridge_dev *sbridge_dev) { struct mem_ctl_info *mci; + struct edac_mc_layer layers[2]; struct sbridge_pvt *pvt; - int rc, channels, csrows; + int rc; /* Check the number of active and not disabled channels */ - rc = sbridge_get_active_channels(sbridge_dev->bus, &channels, &csrows); + rc = check_if_ecc_is_active(sbridge_dev->bus); if (unlikely(rc < 0)) return rc; /* allocate a new MC control structure */ - mci = edac_mc_alloc(sizeof(*pvt), csrows, channels, sbridge_dev->mc); + layers[0].type = EDAC_MC_LAYER_CHANNEL; + layers[0].size = NUM_CHANNELS; + layers[0].is_virt_csrow = false; + layers[1].type = EDAC_MC_LAYER_SLOT; + layers[1].size = MAX_DIMMS; + layers[1].is_virt_csrow = true; + mci = edac_mc_alloc(sbridge_dev->mc, ARRAY_SIZE(layers), layers, + sizeof(*pvt)); + if (unlikely(!mci)) return -ENOMEM; |