From e8f937da744fe13505720a2709c82b182f730fcf Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 18 Nov 2015 14:00:05 +0100 Subject: EDAC, pci: Remove old disabled code Remove an unused edac_pci_find() function iterating over edac_pci_list. Signed-off-by: Borislav Petkov --- drivers/edac/edac_pci.c | 35 ----------------------------------- 1 file changed, 35 deletions(-) (limited to 'drivers/edac') diff --git a/drivers/edac/edac_pci.c b/drivers/edac/edac_pci.c index 2cf44b4db80c..5034385c47e6 100644 --- a/drivers/edac/edac_pci.c +++ b/drivers/edac/edac_pci.c @@ -178,41 +178,6 @@ static void del_edac_pci_from_global_list(struct edac_pci_ctl_info *pci) INIT_LIST_HEAD(&pci->link); } -#if 0 -/* Older code, but might use in the future */ - -/* - * edac_pci_find() - * Search for an edac_pci_ctl_info structure whose index is 'idx' - * - * If found, return a pointer to the structure - * Else return NULL. - * - * Caller must hold pci_ctls_mutex. - */ -struct edac_pci_ctl_info *edac_pci_find(int idx) -{ - struct list_head *item; - struct edac_pci_ctl_info *pci; - - /* Iterage over list, looking for exact match of ID */ - list_for_each(item, &edac_pci_list) { - pci = list_entry(item, struct edac_pci_ctl_info, link); - - if (pci->pci_idx >= idx) { - if (pci->pci_idx == idx) - return pci; - - /* not on list, so terminate early */ - break; - } - } - - return NULL; -} -EXPORT_SYMBOL_GPL(edac_pci_find); -#endif - /* * edac_pci_workq_function() * -- cgit v1.2.3-58-ga151 From d54051f1cc2af6d12c2478911921d32476319621 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 3 Dec 2015 10:57:12 +0100 Subject: EDAC, mpc85xx: Use platform_register/unregister_drivers() These new helpers simplify implementing multi-driver modules and properly handle failure to register one driver by unregistering all previously registered drivers. Reviewed-by: Johannes Thumshirn Signed-off-by: Thierry Reding Cc: linux-edac Link: http://lkml.kernel.org/r/1449136632-11680-1-git-send-email-thierry.reding@gmail.com Signed-off-by: Borislav Petkov --- drivers/edac/mpc85xx_edac.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers/edac') diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c index 23ef8e9f2c9a..3eab06351089 100644 --- a/drivers/edac/mpc85xx_edac.c +++ b/drivers/edac/mpc85xx_edac.c @@ -1208,6 +1208,11 @@ static void __init mpc85xx_mc_clear_rfxe(void *data) } #endif +static struct platform_driver * const drivers[] = { + &mpc85xx_mc_err_driver, + &mpc85xx_l2_err_driver, +}; + static int __init mpc85xx_mc_init(void) { int res = 0; @@ -1226,13 +1231,9 @@ static int __init mpc85xx_mc_init(void) break; } - res = platform_driver_register(&mpc85xx_mc_err_driver); - if (res) - printk(KERN_WARNING EDAC_MOD_STR "MC fails to register\n"); - - res = platform_driver_register(&mpc85xx_l2_err_driver); + res = platform_register_drivers(drivers, ARRAY_SIZE(drivers)); if (res) - printk(KERN_WARNING EDAC_MOD_STR "L2 fails to register\n"); + printk(KERN_WARNING EDAC_MOD_STR "drivers fail to register\n"); #ifdef CONFIG_FSL_SOC_BOOKE pvr = mfspr(SPRN_PVR); @@ -1270,8 +1271,7 @@ static void __exit mpc85xx_mc_exit(void) on_each_cpu(mpc85xx_mc_restore_hid1, NULL, 0); } #endif - platform_driver_unregister(&mpc85xx_l2_err_driver); - platform_driver_unregister(&mpc85xx_mc_err_driver); + platform_unregister_drivers(drivers, ARRAY_SIZE(drivers)); } module_exit(mpc85xx_mc_exit); -- cgit v1.2.3-58-ga151 From 768ce42ccee7f04d02d01dd757fb8100b3a6f958 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 2 Dec 2015 17:18:58 +0100 Subject: EDAC, mv64x60: Use platform_register/unregister_drivers() These new helpers simplify implementing multi-driver modules and properly handle failure to register one driver by unregistering all previously registered drivers. Signed-off-by: Thierry Reding Cc: linux-edac Link: http://lkml.kernel.org/r/1449073138-10852-2-git-send-email-thierry.reding@gmail.com Signed-off-by: Borislav Petkov --- drivers/edac/mv64x60_edac.c | 39 +++++++++++---------------------------- 1 file changed, 11 insertions(+), 28 deletions(-) (limited to 'drivers/edac') diff --git a/drivers/edac/mv64x60_edac.c b/drivers/edac/mv64x60_edac.c index 0574e1bbe45c..6c54127e6eae 100644 --- a/drivers/edac/mv64x60_edac.c +++ b/drivers/edac/mv64x60_edac.c @@ -847,6 +847,15 @@ static struct platform_driver mv64x60_mc_err_driver = { } }; +static struct platform_driver * const drivers[] = { + &mv64x60_mc_err_driver, + &mv64x60_cpu_err_driver, + &mv64x60_sram_err_driver, +#ifdef CONFIG_PCI + &mv64x60_pci_err_driver, +#endif +}; + static int __init mv64x60_edac_init(void) { int ret = 0; @@ -863,39 +872,13 @@ static int __init mv64x60_edac_init(void) break; } - ret = platform_driver_register(&mv64x60_mc_err_driver); - if (ret) - printk(KERN_WARNING EDAC_MOD_STR "MC err failed to register\n"); - - ret = platform_driver_register(&mv64x60_cpu_err_driver); - if (ret) - printk(KERN_WARNING EDAC_MOD_STR - "CPU err failed to register\n"); - - ret = platform_driver_register(&mv64x60_sram_err_driver); - if (ret) - printk(KERN_WARNING EDAC_MOD_STR - "SRAM err failed to register\n"); - -#ifdef CONFIG_PCI - ret = platform_driver_register(&mv64x60_pci_err_driver); - if (ret) - printk(KERN_WARNING EDAC_MOD_STR - "PCI err failed to register\n"); -#endif - - return ret; + return platform_register_drivers(drivers, ARRAY_SIZE(drivers)); } module_init(mv64x60_edac_init); static void __exit mv64x60_edac_exit(void) { -#ifdef CONFIG_PCI - platform_driver_unregister(&mv64x60_pci_err_driver); -#endif - platform_driver_unregister(&mv64x60_sram_err_driver); - platform_driver_unregister(&mv64x60_cpu_err_driver); - platform_driver_unregister(&mv64x60_mc_err_driver); + platform_unregister_drivers(drivers, ARRAY_SIZE(drivers)); } module_exit(mv64x60_edac_exit); -- cgit v1.2.3-58-ga151 From c59f9c06bdbdaeff9da107d2fcec4f46e9b10825 Mon Sep 17 00:00:00 2001 From: Jim Snow Date: Thu, 3 Dec 2015 10:48:52 +0100 Subject: EDAC, sb_edac: Virtualize several hard-coded functions SAD limit, interleave mode and DRAM related functionalities are now virtualized, so that overriding them is easier. Signed-off-by: Jim Snow Acked-by: Tony Luck Cc: Mauro Carvalho Chehab Cc: linux-edac Cc: lukasz.anaczkowski@intel.com Link: http://lkml.kernel.org/r/1449136134-23706-3-git-send-email-hubert.chrzaniuk@intel.com [ Rebase to 4.4-rc3. ] Signed-off-by: Hubert Chrzaniuk Signed-off-by: Borislav Petkov --- drivers/edac/sb_edac.c | 59 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 11 deletions(-) (limited to 'drivers/edac') diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 429309c62699..2e50a3eeeac2 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -65,15 +65,12 @@ static const u32 ibridge_dram_rule[] = { 0xd8, 0xe0, 0xe8, 0xf0, 0xf8, }; -#define SAD_LIMIT(reg) ((GET_BITFIELD(reg, 6, 25) << 26) | 0x3ffffff) -#define DRAM_ATTR(reg) GET_BITFIELD(reg, 2, 3) -#define INTERLEAVE_MODE(reg) GET_BITFIELD(reg, 1, 1) #define DRAM_RULE_ENABLE(reg) GET_BITFIELD(reg, 0, 0) #define A7MODE(reg) GET_BITFIELD(reg, 26, 26) -static char *get_dram_attr(u32 reg) +static char *show_dram_attr(u32 attr) { - switch(DRAM_ATTR(reg)) { + switch (attr) { case 0: return "DRAM"; case 1: @@ -273,6 +270,10 @@ struct sbridge_info { u64 (*get_tolm)(struct sbridge_pvt *pvt); u64 (*get_tohm)(struct sbridge_pvt *pvt); u64 (*rir_limit)(u32 reg); + u64 (*sad_limit)(u32 reg); + u32 (*interleave_mode)(u32 reg); + char* (*show_interleave_mode)(u32 reg); + u32 (*dram_attr)(u32 reg); const u32 *dram_rule; const u32 *interleave_list; const struct interleave_pkg *interleave_pkg; @@ -718,6 +719,26 @@ static u64 rir_limit(u32 reg) return ((u64)GET_BITFIELD(reg, 1, 10) << 29) | 0x1fffffff; } +static u64 sad_limit(u32 reg) +{ + return (GET_BITFIELD(reg, 6, 25) << 26) | 0x3ffffff; +} + +static u32 interleave_mode(u32 reg) +{ + return GET_BITFIELD(reg, 1, 1); +} + +char *show_interleave_mode(u32 reg) +{ + return interleave_mode(reg) ? "8:6" : "[8:6]XOR[18:16]"; +} + +static u32 dram_attr(u32 reg) +{ + return GET_BITFIELD(reg, 2, 3); +} + static enum mem_type get_memory_type(struct sbridge_pvt *pvt) { u32 reg; @@ -1069,7 +1090,7 @@ static void get_memory_layout(const struct mem_ctl_info *mci) /* SAD_LIMIT Address range is 45:26 */ pci_read_config_dword(pvt->pci_sad0, pvt->info.dram_rule[n_sads], ®); - limit = SAD_LIMIT(reg); + limit = pvt->info.sad_limit(reg); if (!DRAM_RULE_ENABLE(reg)) continue; @@ -1081,10 +1102,10 @@ static void get_memory_layout(const struct mem_ctl_info *mci) gb = div_u64_rem(tmp_mb, 1024, &mb); edac_dbg(0, "SAD#%d %s up to %u.%03u GB (0x%016Lx) Interleave: %s reg=0x%08x\n", n_sads, - get_dram_attr(reg), + show_dram_attr(pvt->info.dram_attr(reg)), gb, (mb*1000)/1024, ((u64)tmp_mb) << 20L, - INTERLEAVE_MODE(reg) ? "8:6" : "[8:6]XOR[18:16]", + pvt->info.show_interleave_mode(reg), reg); prv = limit; @@ -1248,7 +1269,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci, if (!DRAM_RULE_ENABLE(reg)) continue; - limit = SAD_LIMIT(reg); + limit = pvt->info.sad_limit(reg); if (limit <= prv) { sprintf(msg, "Can't discover the memory socket"); return -EINVAL; @@ -1262,8 +1283,8 @@ static int get_memory_error_data(struct mem_ctl_info *mci, return -EINVAL; } dram_rule = reg; - *area_type = get_dram_attr(dram_rule); - interleave_mode = INTERLEAVE_MODE(dram_rule); + *area_type = show_dram_attr(pvt->info.dram_attr(dram_rule)); + interleave_mode = pvt->info.interleave_mode(dram_rule); pci_read_config_dword(pvt->pci_sad0, pvt->info.interleave_list[n_sads], ®); @@ -2401,6 +2422,10 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type) pvt->info.get_memory_type = get_memory_type; pvt->info.get_node_id = get_node_id; pvt->info.rir_limit = rir_limit; + pvt->info.sad_limit = sad_limit; + pvt->info.interleave_mode = interleave_mode; + pvt->info.show_interleave_mode = show_interleave_mode; + pvt->info.dram_attr = dram_attr; pvt->info.max_sad = ARRAY_SIZE(ibridge_dram_rule); pvt->info.interleave_list = ibridge_interleave_list; pvt->info.max_interleave = ARRAY_SIZE(ibridge_interleave_list); @@ -2421,6 +2446,10 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type) pvt->info.get_memory_type = get_memory_type; pvt->info.get_node_id = get_node_id; pvt->info.rir_limit = rir_limit; + pvt->info.sad_limit = sad_limit; + pvt->info.interleave_mode = interleave_mode; + pvt->info.show_interleave_mode = show_interleave_mode; + pvt->info.dram_attr = dram_attr; pvt->info.max_sad = ARRAY_SIZE(sbridge_dram_rule); pvt->info.interleave_list = sbridge_interleave_list; pvt->info.max_interleave = ARRAY_SIZE(sbridge_interleave_list); @@ -2441,6 +2470,10 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type) pvt->info.get_memory_type = haswell_get_memory_type; pvt->info.get_node_id = haswell_get_node_id; pvt->info.rir_limit = haswell_rir_limit; + pvt->info.sad_limit = sad_limit; + pvt->info.interleave_mode = interleave_mode; + pvt->info.show_interleave_mode = show_interleave_mode; + pvt->info.dram_attr = dram_attr; pvt->info.max_sad = ARRAY_SIZE(ibridge_dram_rule); pvt->info.interleave_list = ibridge_interleave_list; pvt->info.max_interleave = ARRAY_SIZE(ibridge_interleave_list); @@ -2461,6 +2494,10 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type) pvt->info.get_memory_type = haswell_get_memory_type; pvt->info.get_node_id = haswell_get_node_id; pvt->info.rir_limit = haswell_rir_limit; + pvt->info.sad_limit = sad_limit; + pvt->info.interleave_mode = interleave_mode; + pvt->info.show_interleave_mode = show_interleave_mode; + pvt->info.dram_attr = dram_attr; pvt->info.max_sad = ARRAY_SIZE(ibridge_dram_rule); pvt->info.interleave_list = ibridge_interleave_list; pvt->info.max_interleave = ARRAY_SIZE(ibridge_interleave_list); -- cgit v1.2.3-58-ga151 From c1979ba254810a710bfdc982e3d417a4a7369c31 Mon Sep 17 00:00:00 2001 From: Jim Snow Date: Thu, 3 Dec 2015 10:48:53 +0100 Subject: EDAC, sb_edac: Add support for duplicate device IDs Add options to sbridge_get_all_devices() to allow for duplicate device IDs and devices that are scattered across mulitple PCI buses. Signed-off-by: Jim Snow Acked-by: Tony Luck Cc: Mauro Carvalho Chehab Cc: linux-edac Cc: lukasz.anaczkowski@intel.com Link: http://lkml.kernel.org/r/1449136134-23706-4-git-send-email-hubert.chrzaniuk@intel.com [ Rebase to 4.4-rc3. ] Signed-off-by: Hubert Chrzaniuk Signed-off-by: Borislav Petkov --- drivers/edac/sb_edac.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) (limited to 'drivers/edac') diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 2e50a3eeeac2..c8fbde2bd20a 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -637,10 +637,19 @@ static inline int numcol(u32 mtr) return 1 << cols; } -static struct sbridge_dev *get_sbridge_dev(u8 bus) +static struct sbridge_dev *get_sbridge_dev(u8 bus, int multi_bus) { struct sbridge_dev *sbridge_dev; + /* + * If we have devices scattered across several busses that pertain + * to the same memory controller, we'll lump them all together. + */ + if (multi_bus) { + return list_first_entry_or_null(&sbridge_edac_list, + struct sbridge_dev, list); + } + list_for_each_entry(sbridge_dev, &sbridge_edac_list, list) { if (sbridge_dev->bus == bus) return sbridge_dev; @@ -1588,7 +1597,8 @@ static void sbridge_put_all_devices(void) static int sbridge_get_onedevice(struct pci_dev **prev, u8 *num_mc, const struct pci_id_table *table, - const unsigned devno) + const unsigned devno, + const int multi_bus) { struct sbridge_dev *sbridge_dev; const struct pci_id_descr *dev_descr = &table->descr[devno]; @@ -1624,7 +1634,7 @@ static int sbridge_get_onedevice(struct pci_dev **prev, } bus = pdev->bus->number; - sbridge_dev = get_sbridge_dev(bus); + sbridge_dev = get_sbridge_dev(bus, multi_bus); if (!sbridge_dev) { sbridge_dev = alloc_sbridge_dev(bus, table); if (!sbridge_dev) { @@ -1673,21 +1683,32 @@ static int sbridge_get_onedevice(struct pci_dev **prev, * @num_mc: pointer to the memory controllers count, to be incremented in case * of success. * @table: model specific table + * @allow_dups: allow for multiple devices to exist with the same device id + * (as implemented, this isn't expected to work correctly in the + * multi-socket case). + * @multi_bus: don't assume devices on different buses belong to different + * memory controllers. * * returns 0 in case of success or error code */ -static int sbridge_get_all_devices(u8 *num_mc, - const struct pci_id_table *table) +static int sbridge_get_all_devices_full(u8 *num_mc, + const struct pci_id_table *table, + int allow_dups, + int multi_bus) { int i, rc; struct pci_dev *pdev = NULL; while (table && table->descr) { for (i = 0; i < table->n_devs; i++) { - pdev = NULL; + if (!allow_dups || i == 0 || + table->descr[i].dev_id != + table->descr[i-1].dev_id) { + pdev = NULL; + } do { rc = sbridge_get_onedevice(&pdev, num_mc, - table, i); + table, i, multi_bus); if (rc < 0) { if (i == 0) { i = table->n_devs; @@ -1696,7 +1717,7 @@ static int sbridge_get_all_devices(u8 *num_mc, sbridge_put_all_devices(); return -ENODEV; } - } while (pdev); + } while (pdev && !allow_dups); } table++; } @@ -1704,6 +1725,9 @@ static int sbridge_get_all_devices(u8 *num_mc, return 0; } +#define sbridge_get_all_devices(num_mc, table) \ + sbridge_get_all_devices_full(num_mc, table, 0, 0) + static int sbridge_mci_bind_devs(struct mem_ctl_info *mci, struct sbridge_dev *sbridge_dev) { -- cgit v1.2.3-58-ga151 From d0cdf9003140e9b40d2488aaee2838babe7e212c Mon Sep 17 00:00:00 2001 From: Jim Snow Date: Thu, 3 Dec 2015 10:48:54 +0100 Subject: EDAC, sb_edac: Add Knights Landing (Xeon Phi gen 2) support Knights Landing is the next generation architecture for HPC market. KNL introduces concept of a tile and CHA - Cache/Home Agent for memory accesses. Some things are fixed in KNL: () There's single DIMM slot per channel () There's 2 memory controllers with 3 channels each, however, from EDAC standpoint, it is presented as single memory controller with 6 channels. In order to represent 2 MCs w/ 3 CH, it would require major redesign of EDAC core driver. Basically, two functionalities are added/extended: () during driver initialization KNL topology is being recognized, i.e. which channels are populated with what DIMM sizes (knl_get_dimm_capacity function) () handle MCE errors - channel swizzling Reviewed-by: Tony Luck Signed-off-by: Jim Snow Cc: Mauro Carvalho Chehab Cc: linux-edac Cc: lukasz.anaczkowski@intel.com Link: http://lkml.kernel.org/r/1449136134-23706-5-git-send-email-hubert.chrzaniuk@intel.com [ Rebase to 4.4-rc3. ] Signed-off-by: Hubert Chrzaniuk Signed-off-by: Borislav Petkov --- drivers/edac/sb_edac.c | 966 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 921 insertions(+), 45 deletions(-) (limited to 'drivers/edac') diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index c8fbde2bd20a..b3d924da5985 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -65,6 +65,14 @@ static const u32 ibridge_dram_rule[] = { 0xd8, 0xe0, 0xe8, 0xf0, 0xf8, }; +static const u32 knl_dram_rule[] = { + 0x60, 0x68, 0x70, 0x78, 0x80, /* 0-4 */ + 0x88, 0x90, 0x98, 0xa0, 0xa8, /* 5-9 */ + 0xb0, 0xb8, 0xc0, 0xc8, 0xd0, /* 10-14 */ + 0xd8, 0xe0, 0xe8, 0xf0, 0xf8, /* 15-19 */ + 0x100, 0x108, 0x110, 0x118, /* 20-23 */ +}; + #define DRAM_RULE_ENABLE(reg) GET_BITFIELD(reg, 0, 0) #define A7MODE(reg) GET_BITFIELD(reg, 26, 26) @@ -94,6 +102,14 @@ static const u32 ibridge_interleave_list[] = { 0xdc, 0xe4, 0xec, 0xf4, 0xfc, }; +static const u32 knl_interleave_list[] = { + 0x64, 0x6c, 0x74, 0x7c, 0x84, /* 0-4 */ + 0x8c, 0x94, 0x9c, 0xa4, 0xac, /* 5-9 */ + 0xb4, 0xbc, 0xc4, 0xcc, 0xd4, /* 10-14 */ + 0xdc, 0xe4, 0xec, 0xf4, 0xfc, /* 15-19 */ + 0x104, 0x10c, 0x114, 0x11c, /* 20-23 */ +}; + struct interleave_pkg { unsigned char start; unsigned char end; @@ -131,10 +147,13 @@ static inline int sad_pkg(const struct interleave_pkg *table, u32 reg, /* Devices 12 Function 7 */ #define TOLM 0x80 -#define TOHM 0x84 +#define TOHM 0x84 #define HASWELL_TOLM 0xd0 #define HASWELL_TOHM_0 0xd4 #define HASWELL_TOHM_1 0xd8 +#define KNL_TOLM 0xd0 +#define KNL_TOHM_0 0xd4 +#define KNL_TOHM_1 0xd8 #define GET_TOLM(reg) ((GET_BITFIELD(reg, 0, 3) << 28) | 0x3ffffff) #define GET_TOHM(reg) ((GET_BITFIELD(reg, 0, 20) << 25) | 0x3ffffff) @@ -145,6 +164,8 @@ static inline int sad_pkg(const struct interleave_pkg *table, u32 reg, #define SOURCE_ID(reg) GET_BITFIELD(reg, 9, 11) +#define SOURCE_ID_KNL(reg) GET_BITFIELD(reg, 12, 14) + #define SAD_CONTROL 0xf4 /* Device 14 function 0 */ @@ -167,6 +188,7 @@ static const u32 tad_dram_rule[] = { /* Device 15, function 0 */ #define MCMTR 0x7c +#define KNL_MCMTR 0x624 #define IS_ECC_ENABLED(mcmtr) GET_BITFIELD(mcmtr, 2, 2) #define IS_LOCKSTEP_ENABLED(mcmtr) GET_BITFIELD(mcmtr, 1, 1) @@ -183,6 +205,8 @@ static const int mtr_regs[] = { 0x80, 0x84, 0x88, }; +static const int knl_mtr_reg = 0xb60; + #define RANK_DISABLE(mtr) GET_BITFIELD(mtr, 16, 19) #define IS_DIMM_PRESENT(mtr) GET_BITFIELD(mtr, 14, 14) #define RANK_CNT_BITS(mtr) GET_BITFIELD(mtr, 12, 13) @@ -253,6 +277,9 @@ static const u32 correrrthrsld[] = { #define NUM_CHANNELS 8 /* 2MC per socket, four chan per MC */ #define MAX_DIMMS 3 /* Max DIMMS per channel */ +#define KNL_MAX_CHAS 38 /* KNL max num. of Cache Home Agents */ +#define KNL_MAX_CHANNELS 6 /* KNL max num. of PCI channels */ +#define KNL_MAX_EDCS 8 /* Embedded DRAM controllers */ #define CHANNEL_UNSPECIFIED 0xf /* Intel IA32 SDM 15-14 */ enum type { @@ -260,6 +287,7 @@ enum type { IVY_BRIDGE, HASWELL, BROADWELL, + KNIGHTS_LANDING, }; struct sbridge_pvt; @@ -309,6 +337,16 @@ struct sbridge_dev { struct mem_ctl_info *mci; }; +struct knl_pvt { + struct pci_dev *pci_cha[KNL_MAX_CHAS]; + struct pci_dev *pci_channel[KNL_MAX_CHANNELS]; + struct pci_dev *pci_mc0; + struct pci_dev *pci_mc1; + struct pci_dev *pci_mc0_misc; + struct pci_dev *pci_mc1_misc; + struct pci_dev *pci_mc_info; /* tolm, tohm */ +}; + struct sbridge_pvt { struct pci_dev *pci_ta, *pci_ddrio, *pci_ras; struct pci_dev *pci_sad0, *pci_sad1; @@ -337,6 +375,7 @@ struct sbridge_pvt { /* Memory description */ u64 tolm, tohm; + struct knl_pvt knl; }; #define PCI_DESCR(device_id, opt) \ @@ -510,6 +549,50 @@ static const struct pci_id_table pci_dev_descr_haswell_table[] = { {0,} /* 0 terminated list. */ }; +/* Knight's Landing Support */ +/* + * KNL's memory channels are swizzled between memory controllers. + * MC0 is mapped to CH3,5,6 and MC1 is mapped to CH0,1,2 + */ +#define knl_channel_remap(channel) ((channel + 3) % 6) + +/* Memory controller, TAD tables, error injection - 2-8-0, 2-9-0 (2 of these) */ +#define PCI_DEVICE_ID_INTEL_KNL_IMC_MC 0x7840 +/* DRAM channel stuff; bank addrs, dimmmtr, etc.. 2-8-2 - 2-9-4 (6 of these) */ +#define PCI_DEVICE_ID_INTEL_KNL_IMC_CHANNEL 0x7843 +/* kdrwdbu TAD limits/offsets, MCMTR - 2-10-1, 2-11-1 (2 of these) */ +#define PCI_DEVICE_ID_INTEL_KNL_IMC_TA 0x7844 +/* CHA broadcast registers, dram rules - 1-29-0 (1 of these) */ +#define PCI_DEVICE_ID_INTEL_KNL_IMC_SAD0 0x782a +/* SAD target - 1-29-1 (1 of these) */ +#define PCI_DEVICE_ID_INTEL_KNL_IMC_SAD1 0x782b +/* Caching / Home Agent */ +#define PCI_DEVICE_ID_INTEL_KNL_IMC_CHA 0x782c +/* Device with TOLM and TOHM, 0-5-0 (1 of these) */ +#define PCI_DEVICE_ID_INTEL_KNL_IMC_TOLHM 0x7810 + +/* + * KNL differs from SB, IB, and Haswell in that it has multiple + * instances of the same device with the same device ID, so we handle that + * by creating as many copies in the table as we expect to find. + * (Like device ID must be grouped together.) + */ + +static const struct pci_id_descr pci_dev_descr_knl[] = { + [0] = { PCI_DESCR(PCI_DEVICE_ID_INTEL_KNL_IMC_SAD0, 0) }, + [1] = { PCI_DESCR(PCI_DEVICE_ID_INTEL_KNL_IMC_SAD1, 0) }, + [2 ... 3] = { PCI_DESCR(PCI_DEVICE_ID_INTEL_KNL_IMC_MC, 0)}, + [4 ... 41] = { PCI_DESCR(PCI_DEVICE_ID_INTEL_KNL_IMC_CHA, 0) }, + [42 ... 47] = { PCI_DESCR(PCI_DEVICE_ID_INTEL_KNL_IMC_CHANNEL, 0) }, + [48] = { PCI_DESCR(PCI_DEVICE_ID_INTEL_KNL_IMC_TA, 0) }, + [49] = { PCI_DESCR(PCI_DEVICE_ID_INTEL_KNL_IMC_TOLHM, 0) }, +}; + +static const struct pci_id_table pci_dev_descr_knl_table[] = { + PCI_ID_TABLE_ENTRY(pci_dev_descr_knl), + {0,} +}; + /* * Broadwell support * @@ -586,6 +669,7 @@ static const struct pci_device_id sbridge_pci_tbl[] = { {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TA)}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0)}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0)}, + {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KNL_IMC_SAD0)}, {0,} /* 0 terminated list. */ }; @@ -599,7 +683,7 @@ static inline int numrank(enum type type, u32 mtr) int ranks = (1 << RANK_CNT_BITS(mtr)); int max = 4; - if (type == HASWELL || type == BROADWELL) + if (type == HASWELL || type == BROADWELL || type == KNIGHTS_LANDING) max = 8; if (ranks > max) { @@ -748,6 +832,47 @@ static u32 dram_attr(u32 reg) return GET_BITFIELD(reg, 2, 3); } +static u64 knl_sad_limit(u32 reg) +{ + return (GET_BITFIELD(reg, 7, 26) << 26) | 0x3ffffff; +} + +static u32 knl_interleave_mode(u32 reg) +{ + return GET_BITFIELD(reg, 1, 2); +} + +static char *knl_show_interleave_mode(u32 reg) +{ + char *s; + + switch (knl_interleave_mode(reg)) { + case 0: + s = "use address bits [8:6]"; + break; + case 1: + s = "use address bits [10:8]"; + break; + case 2: + s = "use address bits [14:12]"; + break; + case 3: + s = "use address bits [32:30]"; + break; + default: + WARN_ON(1); + break; + } + + return s; +} + +static u32 dram_attr_knl(u32 reg) +{ + return GET_BITFIELD(reg, 3, 4); +} + + static enum mem_type get_memory_type(struct sbridge_pvt *pvt) { u32 reg; @@ -842,6 +967,12 @@ static enum dev_type broadwell_get_width(struct sbridge_pvt *pvt, u32 mtr) return __ibridge_get_width(GET_BITFIELD(mtr, 8, 9)); } +static enum mem_type knl_get_memory_type(struct sbridge_pvt *pvt) +{ + /* DDR4 RDIMMS and LRDIMMS are supported */ + return MEM_RDDR4; +} + static u8 get_node_id(struct sbridge_pvt *pvt) { u32 reg; @@ -857,6 +988,15 @@ static u8 haswell_get_node_id(struct sbridge_pvt *pvt) return GET_BITFIELD(reg, 0, 3); } +static u8 knl_get_node_id(struct sbridge_pvt *pvt) +{ + u32 reg; + + pci_read_config_dword(pvt->pci_sad1, SAD_CONTROL, ®); + return GET_BITFIELD(reg, 0, 2); +} + + static u64 haswell_get_tolm(struct sbridge_pvt *pvt) { u32 reg; @@ -878,6 +1018,26 @@ static u64 haswell_get_tohm(struct sbridge_pvt *pvt) return rc | 0x1ffffff; } +static u64 knl_get_tolm(struct sbridge_pvt *pvt) +{ + u32 reg; + + pci_read_config_dword(pvt->knl.pci_mc_info, KNL_TOLM, ®); + return (GET_BITFIELD(reg, 26, 31) << 26) | 0x3ffffff; +} + +static u64 knl_get_tohm(struct sbridge_pvt *pvt) +{ + u64 rc; + u32 reg_lo, reg_hi; + + pci_read_config_dword(pvt->knl.pci_mc_info, KNL_TOHM_0, ®_lo); + pci_read_config_dword(pvt->knl.pci_mc_info, KNL_TOHM_1, ®_hi); + rc = ((u64)reg_hi << 32) | reg_lo; + return rc | 0x3ffffff; +} + + static u64 haswell_rir_limit(u32 reg) { return (((u64)GET_BITFIELD(reg, 1, 11) + 1) << 29) - 1; @@ -935,11 +1095,22 @@ static int check_if_ecc_is_active(const u8 bus, enum type type) case BROADWELL: id = PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0_TA; break; + case KNIGHTS_LANDING: + /* + * KNL doesn't group things by bus the same way + * SB/IB/Haswell does. + */ + id = PCI_DEVICE_ID_INTEL_KNL_IMC_TA; + break; default: return -ENODEV; } - pdev = get_pdev_same_bus(bus, id); + if (type != KNIGHTS_LANDING) + pdev = get_pdev_same_bus(bus, id); + else + pdev = pci_get_device(PCI_VENDOR_ID_INTEL, id, 0); + if (!pdev) { sbridge_printk(KERN_ERR, "Couldn't find PCI device " "%04x:%04x! on bus %02d\n", @@ -947,7 +1118,8 @@ static int check_if_ecc_is_active(const u8 bus, enum type type) return -ENODEV; } - pci_read_config_dword(pdev, MCMTR, &mcmtr); + pci_read_config_dword(pdev, + type == KNIGHTS_LANDING ? KNL_MCMTR : MCMTR, &mcmtr); if (!IS_ECC_ENABLED(mcmtr)) { sbridge_printk(KERN_ERR, "ECC is disabled. Aborting\n"); return -ENODEV; @@ -955,6 +1127,476 @@ static int check_if_ecc_is_active(const u8 bus, enum type type) return 0; } +/* Low bits of TAD limit, and some metadata. */ +static const u32 knl_tad_dram_limit_lo[] = { + 0x400, 0x500, 0x600, 0x700, + 0x800, 0x900, 0xa00, 0xb00, +}; + +/* Low bits of TAD offset. */ +static const u32 knl_tad_dram_offset_lo[] = { + 0x404, 0x504, 0x604, 0x704, + 0x804, 0x904, 0xa04, 0xb04, +}; + +/* High 16 bits of TAD limit and offset. */ +static const u32 knl_tad_dram_hi[] = { + 0x408, 0x508, 0x608, 0x708, + 0x808, 0x908, 0xa08, 0xb08, +}; + +/* Number of ways a tad entry is interleaved. */ +static const u32 knl_tad_ways[] = { + 8, 6, 4, 3, 2, 1, +}; + +/* + * Retrieve the n'th Target Address Decode table entry + * from the memory controller's TAD table. + * + * @pvt: driver private data + * @entry: which entry you want to retrieve + * @mc: which memory controller (0 or 1) + * @offset: output tad range offset + * @limit: output address of first byte above tad range + * @ways: output number of interleave ways + * + * The offset value has curious semantics. It's a sort of running total + * of the sizes of all the memory regions that aren't mapped in this + * tad table. + */ +static int knl_get_tad(const struct sbridge_pvt *pvt, + const int entry, + const int mc, + u64 *offset, + u64 *limit, + int *ways) +{ + u32 reg_limit_lo, reg_offset_lo, reg_hi; + struct pci_dev *pci_mc; + int way_id; + + switch (mc) { + case 0: + pci_mc = pvt->knl.pci_mc0; + break; + case 1: + pci_mc = pvt->knl.pci_mc1; + break; + default: + WARN_ON(1); + return -EINVAL; + } + + pci_read_config_dword(pci_mc, + knl_tad_dram_limit_lo[entry], ®_limit_lo); + pci_read_config_dword(pci_mc, + knl_tad_dram_offset_lo[entry], ®_offset_lo); + pci_read_config_dword(pci_mc, + knl_tad_dram_hi[entry], ®_hi); + + /* Is this TAD entry enabled? */ + if (!GET_BITFIELD(reg_limit_lo, 0, 0)) + return -ENODEV; + + way_id = GET_BITFIELD(reg_limit_lo, 3, 5); + + if (way_id < ARRAY_SIZE(knl_tad_ways)) { + *ways = knl_tad_ways[way_id]; + } else { + *ways = 0; + sbridge_printk(KERN_ERR, + "Unexpected value %d in mc_tad_limit_lo wayness field\n", + way_id); + return -ENODEV; + } + + /* + * The least significant 6 bits of base and limit are truncated. + * For limit, we fill the missing bits with 1s. + */ + *offset = ((u64) GET_BITFIELD(reg_offset_lo, 6, 31) << 6) | + ((u64) GET_BITFIELD(reg_hi, 0, 15) << 32); + *limit = ((u64) GET_BITFIELD(reg_limit_lo, 6, 31) << 6) | 63 | + ((u64) GET_BITFIELD(reg_hi, 16, 31) << 32); + + return 0; +} + +/* Determine which memory controller is responsible for a given channel. */ +static int knl_channel_mc(int channel) +{ + WARN_ON(channel < 0 || channel >= 6); + + return channel < 3 ? 1 : 0; +} + +/* + * Get the Nth entry from EDC_ROUTE_TABLE register. + * (This is the per-tile mapping of logical interleave targets to + * physical EDC modules.) + * + * entry 0: 0:2 + * 1: 3:5 + * 2: 6:8 + * 3: 9:11 + * 4: 12:14 + * 5: 15:17 + * 6: 18:20 + * 7: 21:23 + * reserved: 24:31 + */ +static u32 knl_get_edc_route(int entry, u32 reg) +{ + WARN_ON(entry >= KNL_MAX_EDCS); + return GET_BITFIELD(reg, entry*3, (entry*3)+2); +} + +/* + * Get the Nth entry from MC_ROUTE_TABLE register. + * (This is the per-tile mapping of logical interleave targets to + * physical DRAM channels modules.) + * + * entry 0: mc 0:2 channel 18:19 + * 1: mc 3:5 channel 20:21 + * 2: mc 6:8 channel 22:23 + * 3: mc 9:11 channel 24:25 + * 4: mc 12:14 channel 26:27 + * 5: mc 15:17 channel 28:29 + * reserved: 30:31 + * + * Though we have 3 bits to identify the MC, we should only see + * the values 0 or 1. + */ + +static u32 knl_get_mc_route(int entry, u32 reg) +{ + int mc, chan; + + WARN_ON(entry >= KNL_MAX_CHANNELS); + + mc = GET_BITFIELD(reg, entry*3, (entry*3)+2); + chan = GET_BITFIELD(reg, (entry*2) + 18, (entry*2) + 18 + 1); + + return knl_channel_remap(mc*3 + chan); +} + +/* + * Render the EDC_ROUTE register in human-readable form. + * Output string s should be at least KNL_MAX_EDCS*2 bytes. + */ +static void knl_show_edc_route(u32 reg, char *s) +{ + int i; + + for (i = 0; i < KNL_MAX_EDCS; i++) { + s[i*2] = knl_get_edc_route(i, reg) + '0'; + s[i*2+1] = '-'; + } + + s[KNL_MAX_EDCS*2 - 1] = '\0'; +} + +/* + * Render the MC_ROUTE register in human-readable form. + * Output string s should be at least KNL_MAX_CHANNELS*2 bytes. + */ +static void knl_show_mc_route(u32 reg, char *s) +{ + int i; + + for (i = 0; i < KNL_MAX_CHANNELS; i++) { + s[i*2] = knl_get_mc_route(i, reg) + '0'; + s[i*2+1] = '-'; + } + + s[KNL_MAX_CHANNELS*2 - 1] = '\0'; +} + +#define KNL_EDC_ROUTE 0xb8 +#define KNL_MC_ROUTE 0xb4 + +/* Is this dram rule backed by regular DRAM in flat mode? */ +#define KNL_EDRAM(reg) GET_BITFIELD(reg, 29, 29) + +/* Is this dram rule cached? */ +#define KNL_CACHEABLE(reg) GET_BITFIELD(reg, 28, 28) + +/* Is this rule backed by edc ? */ +#define KNL_EDRAM_ONLY(reg) GET_BITFIELD(reg, 29, 29) + +/* Is this rule backed by DRAM, cacheable in EDRAM? */ +#define KNL_CACHEABLE(reg) GET_BITFIELD(reg, 28, 28) + +/* Is this rule mod3? */ +#define KNL_MOD3(reg) GET_BITFIELD(reg, 27, 27) + +/* + * Figure out how big our RAM modules are. + * + * The DIMMMTR register in KNL doesn't tell us the size of the DIMMs, so we + * have to figure this out from the SAD rules, interleave lists, route tables, + * and TAD rules. + * + * SAD rules can have holes in them (e.g. the 3G-4G hole), so we have to + * inspect the TAD rules to figure out how large the SAD regions really are. + * + * When we know the real size of a SAD region and how many ways it's + * interleaved, we know the individual contribution of each channel to + * TAD is size/ways. + * + * Finally, we have to check whether each channel participates in each SAD + * region. + * + * Fortunately, KNL only supports one DIMM per channel, so once we know how + * much memory the channel uses, we know the DIMM is at least that large. + * (The BIOS might possibly choose not to map all available memory, in which + * case we will underreport the size of the DIMM.) + * + * In theory, we could try to determine the EDC sizes as well, but that would + * only work in flat mode, not in cache mode. + * + * @mc_sizes: Output sizes of channels (must have space for KNL_MAX_CHANNELS + * elements) + */ +static int knl_get_dimm_capacity(struct sbridge_pvt *pvt, u64 *mc_sizes) +{ + u64 sad_base, sad_size, sad_limit = 0; + u64 tad_base, tad_size, tad_limit, tad_deadspace, tad_livespace; + int sad_rule = 0; + int tad_rule = 0; + int intrlv_ways, tad_ways; + u32 first_pkg, pkg; + int i; + u64 sad_actual_size[2]; /* sad size accounting for holes, per mc */ + u32 dram_rule, interleave_reg; + u32 mc_route_reg[KNL_MAX_CHAS]; + u32 edc_route_reg[KNL_MAX_CHAS]; + int edram_only; + char edc_route_string[KNL_MAX_EDCS*2]; + char mc_route_string[KNL_MAX_CHANNELS*2]; + int cur_reg_start; + int mc; + int channel; + int way; + int participants[KNL_MAX_CHANNELS]; + int participant_count = 0; + + for (i = 0; i < KNL_MAX_CHANNELS; i++) + mc_sizes[i] = 0; + + /* Read the EDC route table in each CHA. */ + cur_reg_start = 0; + for (i = 0; i < KNL_MAX_CHAS; i++) { + pci_read_config_dword(pvt->knl.pci_cha[i], + KNL_EDC_ROUTE, &edc_route_reg[i]); + + if (i > 0 && edc_route_reg[i] != edc_route_reg[i-1]) { + knl_show_edc_route(edc_route_reg[i-1], + edc_route_string); + if (cur_reg_start == i-1) + edac_dbg(0, "edc route table for CHA %d: %s\n", + cur_reg_start, edc_route_string); + else + edac_dbg(0, "edc route table for CHA %d-%d: %s\n", + cur_reg_start, i-1, edc_route_string); + cur_reg_start = i; + } + } + knl_show_edc_route(edc_route_reg[i-1], edc_route_string); + if (cur_reg_start == i-1) + edac_dbg(0, "edc route table for CHA %d: %s\n", + cur_reg_start, edc_route_string); + else + edac_dbg(0, "edc route table for CHA %d-%d: %s\n", + cur_reg_start, i-1, edc_route_string); + + /* Read the MC route table in each CHA. */ + cur_reg_start = 0; + for (i = 0; i < KNL_MAX_CHAS; i++) { + pci_read_config_dword(pvt->knl.pci_cha[i], + KNL_MC_ROUTE, &mc_route_reg[i]); + + if (i > 0 && mc_route_reg[i] != mc_route_reg[i-1]) { + knl_show_mc_route(mc_route_reg[i-1], mc_route_string); + if (cur_reg_start == i-1) + edac_dbg(0, "mc route table for CHA %d: %s\n", + cur_reg_start, mc_route_string); + else + edac_dbg(0, "mc route table for CHA %d-%d: %s\n", + cur_reg_start, i-1, mc_route_string); + cur_reg_start = i; + } + } + knl_show_mc_route(mc_route_reg[i-1], mc_route_string); + if (cur_reg_start == i-1) + edac_dbg(0, "mc route table for CHA %d: %s\n", + cur_reg_start, mc_route_string); + else + edac_dbg(0, "mc route table for CHA %d-%d: %s\n", + cur_reg_start, i-1, mc_route_string); + + /* Process DRAM rules */ + for (sad_rule = 0; sad_rule < pvt->info.max_sad; sad_rule++) { + /* previous limit becomes the new base */ + sad_base = sad_limit; + + pci_read_config_dword(pvt->pci_sad0, + pvt->info.dram_rule[sad_rule], &dram_rule); + + if (!DRAM_RULE_ENABLE(dram_rule)) + break; + + edram_only = KNL_EDRAM_ONLY(dram_rule); + + sad_limit = pvt->info.sad_limit(dram_rule)+1; + sad_size = sad_limit - sad_base; + + pci_read_config_dword(pvt->pci_sad0, + pvt->info.interleave_list[sad_rule], &interleave_reg); + + /* + * Find out how many ways this dram rule is interleaved. + * We stop when we see the first channel again. + */ + first_pkg = sad_pkg(pvt->info.interleave_pkg, + interleave_reg, 0); + for (intrlv_ways = 1; intrlv_ways < 8; intrlv_ways++) { + pkg = sad_pkg(pvt->info.interleave_pkg, + interleave_reg, intrlv_ways); + + if ((pkg & 0x8) == 0) { + /* + * 0 bit means memory is non-local, + * which KNL doesn't support + */ + edac_dbg(0, "Unexpected interleave target %d\n", + pkg); + return -1; + } + + if (pkg == first_pkg) + break; + } + if (KNL_MOD3(dram_rule)) + intrlv_ways *= 3; + + edac_dbg(3, "dram rule %d (base 0x%llx, limit 0x%llx), %d way interleave%s\n", + sad_rule, + sad_base, + sad_limit, + intrlv_ways, + edram_only ? ", EDRAM" : ""); + + /* + * Find out how big the SAD region really is by iterating + * over TAD tables (SAD regions may contain holes). + * Each memory controller might have a different TAD table, so + * we have to look at both. + * + * Livespace is the memory that's mapped in this TAD table, + * deadspace is the holes (this could be the MMIO hole, or it + * could be memory that's mapped by the other TAD table but + * not this one). + */ + for (mc = 0; mc < 2; mc++) { + sad_actual_size[mc] = 0; + tad_livespace = 0; + for (tad_rule = 0; + tad_rule < ARRAY_SIZE( + knl_tad_dram_limit_lo); + tad_rule++) { + if (knl_get_tad(pvt, + tad_rule, + mc, + &tad_deadspace, + &tad_limit, + &tad_ways)) + break; + + tad_size = (tad_limit+1) - + (tad_livespace + tad_deadspace); + tad_livespace += tad_size; + tad_base = (tad_limit+1) - tad_size; + + if (tad_base < sad_base) { + if (tad_limit > sad_base) + edac_dbg(0, "TAD region overlaps lower SAD boundary -- TAD tables may be configured incorrectly.\n"); + } else if (tad_base < sad_limit) { + if (tad_limit+1 > sad_limit) { + edac_dbg(0, "TAD region overlaps upper SAD boundary -- TAD tables may be configured incorrectly.\n"); + } else { + /* TAD region is completely inside SAD region */ + edac_dbg(3, "TAD region %d 0x%llx - 0x%llx (%lld bytes) table%d\n", + tad_rule, tad_base, + tad_limit, tad_size, + mc); + sad_actual_size[mc] += tad_size; + } + } + tad_base = tad_limit+1; + } + } + + for (mc = 0; mc < 2; mc++) { + edac_dbg(3, " total TAD DRAM footprint in table%d : 0x%llx (%lld bytes)\n", + mc, sad_actual_size[mc], sad_actual_size[mc]); + } + + /* Ignore EDRAM rule */ + if (edram_only) + continue; + + /* Figure out which channels participate in interleave. */ + for (channel = 0; channel < KNL_MAX_CHANNELS; channel++) + participants[channel] = 0; + + /* For each channel, does at least one CHA have + * this channel mapped to the given target? + */ + for (channel = 0; channel < KNL_MAX_CHANNELS; channel++) { + for (way = 0; way < intrlv_ways; way++) { + int target; + int cha; + + if (KNL_MOD3(dram_rule)) + target = way; + else + target = 0x7 & sad_pkg( + pvt->info.interleave_pkg, interleave_reg, way); + + for (cha = 0; cha < KNL_MAX_CHAS; cha++) { + if (knl_get_mc_route(target, + mc_route_reg[cha]) == channel + && participants[channel]) { + participant_count++; + participants[channel] = 1; + break; + } + } + } + } + + if (participant_count != intrlv_ways) + edac_dbg(0, "participant_count (%d) != interleave_ways (%d): DIMM size may be incorrect\n", + participant_count, intrlv_ways); + + for (channel = 0; channel < KNL_MAX_CHANNELS; channel++) { + mc = knl_channel_mc(channel); + if (participants[channel]) { + edac_dbg(4, "mc channel %d contributes %lld bytes via sad entry %d\n", + channel, + sad_actual_size[mc]/intrlv_ways, + sad_rule); + mc_sizes[channel] += + sad_actual_size[mc]/intrlv_ways; + } + } + } + + return 0; +} + static int get_dimm_config(struct mem_ctl_info *mci) { struct sbridge_pvt *pvt = mci->pvt_info; @@ -964,13 +1606,20 @@ static int get_dimm_config(struct mem_ctl_info *mci) u32 reg; enum edac_type mode; enum mem_type mtype; + int channels = pvt->info.type == KNIGHTS_LANDING ? + KNL_MAX_CHANNELS : NUM_CHANNELS; + u64 knl_mc_sizes[KNL_MAX_CHANNELS]; - if (pvt->info.type == HASWELL || pvt->info.type == BROADWELL) + if (pvt->info.type == HASWELL || pvt->info.type == BROADWELL || + pvt->info.type == KNIGHTS_LANDING) pci_read_config_dword(pvt->pci_sad1, SAD_TARGET, ®); else pci_read_config_dword(pvt->pci_br0, SAD_TARGET, ®); - pvt->sbridge_dev->source_id = SOURCE_ID(reg); + if (pvt->info.type == KNIGHTS_LANDING) + pvt->sbridge_dev->source_id = SOURCE_ID_KNL(reg); + else + pvt->sbridge_dev->source_id = SOURCE_ID(reg); pvt->sbridge_dev->node_id = pvt->info.get_node_id(pvt); edac_dbg(0, "mc#%d: Node ID: %d, source ID: %d\n", @@ -978,31 +1627,42 @@ static int get_dimm_config(struct mem_ctl_info *mci) pvt->sbridge_dev->node_id, pvt->sbridge_dev->source_id); - pci_read_config_dword(pvt->pci_ras, RASENABLES, ®); - if (IS_MIRROR_ENABLED(reg)) { - edac_dbg(0, "Memory mirror is enabled\n"); - pvt->is_mirrored = true; - } else { - edac_dbg(0, "Memory mirror is disabled\n"); + /* KNL doesn't support mirroring or lockstep, + * and is always closed page + */ + if (pvt->info.type == KNIGHTS_LANDING) { + mode = EDAC_S4ECD4ED; pvt->is_mirrored = false; - } - pci_read_config_dword(pvt->pci_ta, MCMTR, &pvt->info.mcmtr); - if (IS_LOCKSTEP_ENABLED(pvt->info.mcmtr)) { - edac_dbg(0, "Lockstep is enabled\n"); - mode = EDAC_S8ECD8ED; - pvt->is_lockstep = true; + if (knl_get_dimm_capacity(pvt, knl_mc_sizes) != 0) + return -1; } else { - edac_dbg(0, "Lockstep is disabled\n"); - mode = EDAC_S4ECD4ED; - pvt->is_lockstep = false; - } - if (IS_CLOSE_PG(pvt->info.mcmtr)) { - edac_dbg(0, "address map is on closed page mode\n"); - pvt->is_close_pg = true; - } else { - edac_dbg(0, "address map is on open page mode\n"); - pvt->is_close_pg = false; + pci_read_config_dword(pvt->pci_ras, RASENABLES, ®); + if (IS_MIRROR_ENABLED(reg)) { + edac_dbg(0, "Memory mirror is enabled\n"); + pvt->is_mirrored = true; + } else { + edac_dbg(0, "Memory mirror is disabled\n"); + pvt->is_mirrored = false; + } + + pci_read_config_dword(pvt->pci_ta, MCMTR, &pvt->info.mcmtr); + if (IS_LOCKSTEP_ENABLED(pvt->info.mcmtr)) { + edac_dbg(0, "Lockstep is enabled\n"); + mode = EDAC_S8ECD8ED; + pvt->is_lockstep = true; + } else { + edac_dbg(0, "Lockstep is disabled\n"); + mode = EDAC_S4ECD4ED; + pvt->is_lockstep = false; + } + if (IS_CLOSE_PG(pvt->info.mcmtr)) { + edac_dbg(0, "address map is on closed page mode\n"); + pvt->is_close_pg = true; + } else { + edac_dbg(0, "address map is on open page mode\n"); + pvt->is_close_pg = false; + } } mtype = pvt->info.get_memory_type(pvt); @@ -1018,23 +1678,46 @@ static int get_dimm_config(struct mem_ctl_info *mci) else banks = 8; - for (i = 0; i < NUM_CHANNELS; i++) { + for (i = 0; i < channels; i++) { u32 mtr; - if (!pvt->pci_tad[i]) - continue; - for (j = 0; j < ARRAY_SIZE(mtr_regs); j++) { + int max_dimms_per_channel; + + if (pvt->info.type == KNIGHTS_LANDING) { + max_dimms_per_channel = 1; + if (!pvt->knl.pci_channel[i]) + continue; + } else { + max_dimms_per_channel = ARRAY_SIZE(mtr_regs); + if (!pvt->pci_tad[i]) + continue; + } + + for (j = 0; j < max_dimms_per_channel; j++) { dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, mci->n_layers, i, j, 0); - pci_read_config_dword(pvt->pci_tad[i], - mtr_regs[j], &mtr); + if (pvt->info.type == KNIGHTS_LANDING) { + pci_read_config_dword(pvt->knl.pci_channel[i], + knl_mtr_reg, &mtr); + } else { + pci_read_config_dword(pvt->pci_tad[i], + mtr_regs[j], &mtr); + } edac_dbg(4, "Channel #%d MTR%d = %x\n", i, j, mtr); if (IS_DIMM_PRESENT(mtr)) { pvt->channel[i].dimms++; ranks = numrank(pvt->info.type, mtr); - rows = numrow(mtr); - cols = numcol(mtr); + + if (pvt->info.type == KNIGHTS_LANDING) { + /* For DDR4, this is fixed. */ + cols = 1 << 10; + rows = knl_mc_sizes[i] / + ((u64) cols * ranks * banks * 8); + } else { + rows = numrow(mtr); + cols = numcol(mtr); + } size = ((u64)rows * cols * banks * ranks) >> (20 - 3); npages = MiB_TO_PAGES(size); @@ -1131,6 +1814,9 @@ static void get_memory_layout(const struct mem_ctl_info *mci) } } + if (pvt->info.type == KNIGHTS_LANDING) + return; + /* * Step 3) Get TAD range */ @@ -1727,6 +2413,8 @@ static int sbridge_get_all_devices_full(u8 *num_mc, #define sbridge_get_all_devices(num_mc, table) \ sbridge_get_all_devices_full(num_mc, table, 0, 0) +#define sbridge_get_all_devices_knl(num_mc, table) \ + sbridge_get_all_devices_full(num_mc, table, 1, 1) static int sbridge_mci_bind_devs(struct mem_ctl_info *mci, struct sbridge_dev *sbridge_dev) @@ -2083,6 +2771,131 @@ enodev: return -ENODEV; } +static int knl_mci_bind_devs(struct mem_ctl_info *mci, + struct sbridge_dev *sbridge_dev) +{ + struct sbridge_pvt *pvt = mci->pvt_info; + struct pci_dev *pdev; + int dev, func; + + int i; + int devidx; + + for (i = 0; i < sbridge_dev->n_devs; i++) { + pdev = sbridge_dev->pdev[i]; + if (!pdev) + continue; + + /* Extract PCI device and function. */ + dev = (pdev->devfn >> 3) & 0x1f; + func = pdev->devfn & 0x7; + + switch (pdev->device) { + case PCI_DEVICE_ID_INTEL_KNL_IMC_MC: + if (dev == 8) + pvt->knl.pci_mc0 = pdev; + else if (dev == 9) + pvt->knl.pci_mc1 = pdev; + else { + sbridge_printk(KERN_ERR, + "Memory controller in unexpected place! (dev %d, fn %d)\n", + dev, func); + continue; + } + break; + + case PCI_DEVICE_ID_INTEL_KNL_IMC_SAD0: + pvt->pci_sad0 = pdev; + break; + + case PCI_DEVICE_ID_INTEL_KNL_IMC_SAD1: + pvt->pci_sad1 = pdev; + break; + + case PCI_DEVICE_ID_INTEL_KNL_IMC_CHA: + /* There are one of these per tile, and range from + * 1.14.0 to 1.18.5. + */ + devidx = ((dev-14)*8)+func; + + if (devidx < 0 || devidx >= KNL_MAX_CHAS) { + sbridge_printk(KERN_ERR, + "Caching and Home Agent in unexpected place! (dev %d, fn %d)\n", + dev, func); + continue; + } + + WARN_ON(pvt->knl.pci_cha[devidx] != NULL); + + pvt->knl.pci_cha[devidx] = pdev; + break; + + case PCI_DEVICE_ID_INTEL_KNL_IMC_CHANNEL: + devidx = -1; + + /* + * MC0 channels 0-2 are device 9 function 2-4, + * MC1 channels 3-5 are device 8 function 2-4. + */ + + if (dev == 9) + devidx = func-2; + else if (dev == 8) + devidx = 3 + (func-2); + + if (devidx < 0 || devidx >= KNL_MAX_CHANNELS) { + sbridge_printk(KERN_ERR, + "DRAM Channel Registers in unexpected place! (dev %d, fn %d)\n", + dev, func); + continue; + } + + WARN_ON(pvt->knl.pci_channel[devidx] != NULL); + pvt->knl.pci_channel[devidx] = pdev; + break; + + case PCI_DEVICE_ID_INTEL_KNL_IMC_TOLHM: + pvt->knl.pci_mc_info = pdev; + break; + + case PCI_DEVICE_ID_INTEL_KNL_IMC_TA: + pvt->pci_ta = pdev; + break; + + default: + sbridge_printk(KERN_ERR, "Unexpected device %d\n", + pdev->device); + break; + } + } + + if (!pvt->knl.pci_mc0 || !pvt->knl.pci_mc1 || + !pvt->pci_sad0 || !pvt->pci_sad1 || + !pvt->pci_ta) { + goto enodev; + } + + for (i = 0; i < KNL_MAX_CHANNELS; i++) { + if (!pvt->knl.pci_channel[i]) { + sbridge_printk(KERN_ERR, "Missing channel %d\n", i); + goto enodev; + } + } + + for (i = 0; i < KNL_MAX_CHAS; i++) { + if (!pvt->knl.pci_cha[i]) { + sbridge_printk(KERN_ERR, "Missing CHA %d\n", i); + goto enodev; + } + } + + return 0; + +enodev: + sbridge_printk(KERN_ERR, "Some needed devices are missing\n"); + return -ENODEV; +} + /**************************************************************************** Error check routines ****************************************************************************/ @@ -2172,8 +2985,36 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci, if (!GET_BITFIELD(m->status, 58, 58)) return; - rc = get_memory_error_data(mci, m->addr, &socket, &ha, - &channel_mask, &rank, &area_type, msg); + if (pvt->info.type == KNIGHTS_LANDING) { + if (channel == 14) { + edac_dbg(0, "%s%s err_code:%04x:%04x EDRAM bank %d\n", + overflow ? " OVERFLOW" : "", + (uncorrected_error && recoverable) + ? " recoverable" : "", + mscod, errcode, + m->bank); + } else { + char A = *("A"); + + channel = knl_channel_remap(channel); + channel_mask = 1 << channel; + snprintf(msg, sizeof(msg), + "%s%s err_code:%04x:%04x channel:%d (DIMM_%c)", + overflow ? " OVERFLOW" : "", + (uncorrected_error && recoverable) + ? " recoverable" : " ", + mscod, errcode, channel, A + channel); + edac_mc_handle_error(tp_event, mci, core_err_cnt, + m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0, + channel, 0, -1, + optype, msg); + } + return; + } else { + rc = get_memory_error_data(mci, m->addr, &socket, &ha, + &channel_mask, &rank, &area_type, msg); + } + if (rc < 0) goto err_parsing; new_mci = get_mci_for_node_id(socket); @@ -2404,10 +3245,11 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type) /* allocate a new MC control structure */ layers[0].type = EDAC_MC_LAYER_CHANNEL; - layers[0].size = NUM_CHANNELS; + layers[0].size = type == KNIGHTS_LANDING ? + KNL_MAX_CHANNELS : NUM_CHANNELS; layers[0].is_virt_csrow = false; layers[1].type = EDAC_MC_LAYER_SLOT; - layers[1].size = MAX_DIMMS; + layers[1].size = type == KNIGHTS_LANDING ? 1 : MAX_DIMMS; layers[1].is_virt_csrow = true; mci = edac_mc_alloc(sbridge_dev->mc, ARRAY_SIZE(layers), layers, sizeof(*pvt)); @@ -2425,7 +3267,8 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type) pvt->sbridge_dev = sbridge_dev; sbridge_dev->mci = mci; - mci->mtype_cap = MEM_FLAG_DDR3; + mci->mtype_cap = type == KNIGHTS_LANDING ? + MEM_FLAG_DDR4 : MEM_FLAG_DDR3; mci->edac_ctl_cap = EDAC_FLAG_NONE; mci->edac_cap = EDAC_FLAG_NONE; mci->mod_name = "sbridge_edac.c"; @@ -2534,6 +3377,30 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type) if (unlikely(rc < 0)) goto fail0; break; + case KNIGHTS_LANDING: + /* pvt->info.rankcfgr == ??? */ + pvt->info.get_tolm = knl_get_tolm; + pvt->info.get_tohm = knl_get_tohm; + pvt->info.dram_rule = knl_dram_rule; + pvt->info.get_memory_type = knl_get_memory_type; + pvt->info.get_node_id = knl_get_node_id; + pvt->info.rir_limit = NULL; + pvt->info.sad_limit = knl_sad_limit; + pvt->info.interleave_mode = knl_interleave_mode; + pvt->info.show_interleave_mode = knl_show_interleave_mode; + pvt->info.dram_attr = dram_attr_knl; + pvt->info.max_sad = ARRAY_SIZE(knl_dram_rule); + pvt->info.interleave_list = knl_interleave_list; + pvt->info.max_interleave = ARRAY_SIZE(knl_interleave_list); + pvt->info.interleave_pkg = ibridge_interleave_pkg; + pvt->info.get_width = ibridge_get_width; + mci->ctl_name = kasprintf(GFP_KERNEL, + "Knights Landing Socket#%d", mci->mc_idx); + + rc = knl_mci_bind_devs(mci, sbridge_dev); + if (unlikely(rc < 0)) + goto fail0; + break; } /* Get dimm basic config and the memory layout */ @@ -2588,20 +3455,29 @@ static int sbridge_probe(struct pci_dev *pdev, const struct pci_device_id *id) switch (pdev->device) { case PCI_DEVICE_ID_INTEL_IBRIDGE_IMC_HA0_TA: - rc = sbridge_get_all_devices(&num_mc, pci_dev_descr_ibridge_table); + rc = sbridge_get_all_devices(&num_mc, + pci_dev_descr_ibridge_table); type = IVY_BRIDGE; break; case PCI_DEVICE_ID_INTEL_SBRIDGE_IMC_HA0: - rc = sbridge_get_all_devices(&num_mc, pci_dev_descr_sbridge_table); + rc = sbridge_get_all_devices(&num_mc, + pci_dev_descr_sbridge_table); type = SANDY_BRIDGE; break; case PCI_DEVICE_ID_INTEL_HASWELL_IMC_HA0: - rc = sbridge_get_all_devices(&num_mc, pci_dev_descr_haswell_table); + rc = sbridge_get_all_devices(&num_mc, + pci_dev_descr_haswell_table); type = HASWELL; break; case PCI_DEVICE_ID_INTEL_BROADWELL_IMC_HA0: - rc = sbridge_get_all_devices(&num_mc, pci_dev_descr_broadwell_table); + rc = sbridge_get_all_devices(&num_mc, + pci_dev_descr_broadwell_table); type = BROADWELL; + break; + case PCI_DEVICE_ID_INTEL_KNL_IMC_SAD0: + rc = sbridge_get_all_devices_knl(&num_mc, + pci_dev_descr_knl_table); + type = KNIGHTS_LANDING; break; } if (unlikely(rc < 0)) { -- cgit v1.2.3-58-ga151 From 666db563d3d9fffcfc019e3d1a980dac47601a71 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Thu, 10 Dec 2015 13:07:12 -0600 Subject: EDAC, mpc85xx: Make mpc85xx-pci-edac a platform device Originally the mpc85xx-pci-edac driver bound directly to the PCI controller node. Commit 905e75c46dba ("powerpc/fsl-pci: Unify pci/pcie initialization code") turned the PCI controller code into a platform device. Since we can't have two drivers binding to the same device, the EDAC code was changed to be called into as a library-style submodule. However, this doesn't work if the EDAC driver is built as a module. Commit 8d8fcba6d1ea ("EDAC: Rip out the edac_subsys reference counting") exposed another problem with this approach -- mpc85xx_pci_err_probe() was being called in the same early boot phase that the PCI controller is initialized, rather than in the device_initcall phase that the EDAC layer expects. This caused a crash on boot. To fix this, the PCI controller code now creates a child platform device specifically for EDAC, which the mpc85xx-pci-edac driver binds to. Reported-by: Michael Ellerman Reviewed-by: Johannes Thumshirn Signed-off-by: Scott Wood Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Daniel Axtens Cc: Doug Thompson Cc: Jia Hongtao Cc: Jiri Kosina Cc: Kim Phillips Cc: linux-edac Cc: linuxppc-dev@lists.ozlabs.org Cc: Masanari Iida Cc: Mauro Carvalho Chehab Cc: Paul Mackerras Cc: Randy Dunlap Cc: Rob Herring Link: http://lkml.kernel.org/r/1449774432-18593-1-git-send-email-scottwood@freescale.com Signed-off-by: Borislav Petkov --- arch/powerpc/sysdev/fsl_pci.c | 28 +++++++++++++++++++++++++++- arch/powerpc/sysdev/fsl_pci.h | 9 --------- drivers/edac/mpc85xx_edac.c | 38 +++++++++++++++++++++++++++++++++----- include/linux/fsl/edac.h | 8 ++++++++ 4 files changed, 68 insertions(+), 15 deletions(-) create mode 100644 include/linux/fsl/edac.h (limited to 'drivers/edac') diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index 610f472f91d1..a1ac80b3041a 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -21,10 +21,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -1255,6 +1257,25 @@ void fsl_pcibios_fixup_phb(struct pci_controller *phb) #endif } +static int add_err_dev(struct platform_device *pdev) +{ + struct platform_device *errdev; + struct mpc85xx_edac_pci_plat_data pd = { + .of_node = pdev->dev.of_node + }; + + errdev = platform_device_register_resndata(&pdev->dev, + "mpc85xx-pci-edac", + PLATFORM_DEVID_AUTO, + pdev->resource, + pdev->num_resources, + &pd, sizeof(pd)); + if (IS_ERR(errdev)) + return PTR_ERR(errdev); + + return 0; +} + static int fsl_pci_probe(struct platform_device *pdev) { struct device_node *node; @@ -1262,8 +1283,13 @@ static int fsl_pci_probe(struct platform_device *pdev) node = pdev->dev.of_node; ret = fsl_add_bridge(pdev, fsl_pci_primary == node); + if (ret) + return ret; - mpc85xx_pci_err_probe(pdev); + ret = add_err_dev(pdev); + if (ret) + dev_err(&pdev->dev, "couldn't register error device: %d\n", + ret); return 0; } diff --git a/arch/powerpc/sysdev/fsl_pci.h b/arch/powerpc/sysdev/fsl_pci.h index c1cec771d5ea..151588530b06 100644 --- a/arch/powerpc/sysdev/fsl_pci.h +++ b/arch/powerpc/sysdev/fsl_pci.h @@ -130,15 +130,6 @@ void fsl_pci_assign_primary(void); static inline void fsl_pci_assign_primary(void) {} #endif -#ifdef CONFIG_EDAC_MPC85XX -int mpc85xx_pci_err_probe(struct platform_device *op); -#else -static inline int mpc85xx_pci_err_probe(struct platform_device *op) -{ - return -ENOTSUPP; -} -#endif - #ifdef CONFIG_FSL_PCI extern int fsl_pci_mcheck_exception(struct pt_regs *); #else diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c index 3eab06351089..b7139c160baf 100644 --- a/drivers/edac/mpc85xx_edac.c +++ b/drivers/edac/mpc85xx_edac.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -238,10 +239,12 @@ static irqreturn_t mpc85xx_pci_isr(int irq, void *dev_id) return IRQ_HANDLED; } -int mpc85xx_pci_err_probe(struct platform_device *op) +static int mpc85xx_pci_err_probe(struct platform_device *op) { struct edac_pci_ctl_info *pci; struct mpc85xx_pci_pdata *pdata; + struct mpc85xx_edac_pci_plat_data *plat_data; + struct device_node *of_node; struct resource r; int res = 0; @@ -266,7 +269,15 @@ int mpc85xx_pci_err_probe(struct platform_device *op) pdata->name = "mpc85xx_pci_err"; pdata->irq = NO_IRQ; - if (mpc85xx_pcie_find_capability(op->dev.of_node) > 0) + plat_data = op->dev.platform_data; + if (!plat_data) { + dev_err(&op->dev, "no platform data"); + res = -ENXIO; + goto err; + } + of_node = plat_data->of_node; + + if (mpc85xx_pcie_find_capability(of_node) > 0) pdata->is_pcie = true; dev_set_drvdata(&op->dev, pci); @@ -284,7 +295,7 @@ int mpc85xx_pci_err_probe(struct platform_device *op) pdata->edac_idx = edac_pci_idx++; - res = of_address_to_resource(op->dev.of_node, 0, &r); + res = of_address_to_resource(of_node, 0, &r); if (res) { printk(KERN_ERR "%s: Unable to get resource for " "PCI err regs\n", __func__); @@ -339,7 +350,7 @@ int mpc85xx_pci_err_probe(struct platform_device *op) } if (edac_op_state == EDAC_OPSTATE_INT) { - pdata->irq = irq_of_parse_and_map(op->dev.of_node, 0); + pdata->irq = irq_of_parse_and_map(of_node, 0); res = devm_request_irq(&op->dev, pdata->irq, mpc85xx_pci_isr, IRQF_SHARED, @@ -386,8 +397,22 @@ err: devres_release_group(&op->dev, mpc85xx_pci_err_probe); return res; } -EXPORT_SYMBOL(mpc85xx_pci_err_probe); +static const struct platform_device_id mpc85xx_pci_err_match[] = { + { + .name = "mpc85xx-pci-edac" + }, + {} +}; + +static struct platform_driver mpc85xx_pci_err_driver = { + .probe = mpc85xx_pci_err_probe, + .id_table = mpc85xx_pci_err_match, + .driver = { + .name = "mpc85xx_pci_err", + .suppress_bind_attrs = true, + }, +}; #endif /* CONFIG_PCI */ /**************************** L2 Err device ***************************/ @@ -1211,6 +1236,9 @@ static void __init mpc85xx_mc_clear_rfxe(void *data) static struct platform_driver * const drivers[] = { &mpc85xx_mc_err_driver, &mpc85xx_l2_err_driver, +#ifdef CONFIG_PCI + &mpc85xx_pci_err_driver, +#endif }; static int __init mpc85xx_mc_init(void) diff --git a/include/linux/fsl/edac.h b/include/linux/fsl/edac.h new file mode 100644 index 000000000000..90d64d4ec1a9 --- /dev/null +++ b/include/linux/fsl/edac.h @@ -0,0 +1,8 @@ +#ifndef FSL_EDAC_H +#define FSL_EDAC_H + +struct mpc85xx_edac_pci_plat_data { + struct device_node *of_node; +}; + +#endif -- cgit v1.2.3-58-ga151 From 12e26969b32c79018165d52caff3762135614aa1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 1 Dec 2015 15:52:36 +0100 Subject: EDAC, mc_sysfs: Fix freeing bus' name I get the splat below when modprobing/rmmoding EDAC drivers. It happens because bus->name is invalid after bus_unregister() has run. The Code: section below corresponds to: .loc 1 1108 0 movq 672(%rbx), %rax # mci_1(D)->bus, mci_1(D)->bus .loc 1 1109 0 popq %rbx # .loc 1 1108 0 movq (%rax), %rdi # _7->name, jmp kfree # and %rax has some funky stuff 2030203020312030 which looks a lot like something walked over it. Fix that by saving the name ptr before doing stuff to string it points to. general protection fault: 0000 [#1] SMP Modules linked in: ... CPU: 4 PID: 10318 Comm: modprobe Tainted: G I EN 3.12.51-11-default+ #48 Hardware name: HP ProLiant DL380 G7, BIOS P67 05/05/2011 task: ffff880311320280 ti: ffff88030da3e000 task.ti: ffff88030da3e000 RIP: 0010:[] [] edac_unregister_sysfs+0x22/0x30 [edac_core] RSP: 0018:ffff88030da3fe28 EFLAGS: 00010292 RAX: 2030203020312030 RBX: ffff880311b4e000 RCX: 000000000000095c RDX: 0000000000000001 RSI: ffff880327bb9600 RDI: 0000000000000286 RBP: ffff880311b4e750 R08: 0000000000000000 R09: ffffffff81296110 R10: 0000000000000400 R11: 0000000000000000 R12: ffff88030ba1ac68 R13: 0000000000000001 R14: 00000000011b02f0 R15: 0000000000000000 FS: 00007fc9bf8f5700(0000) GS:ffff8801a7c40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000403c90 CR3: 000000019ebdf000 CR4: 00000000000007e0 Stack: Call Trace: i7core_unregister_mci.isra.9 i7core_remove pci_device_remove __device_release_driver driver_detach bus_remove_driver pci_unregister_driver i7core_exit SyS_delete_module system_call_fastpath 0x7fc9bf426536 Code: 2e 0f 1f 84 00 00 00 00 00 66 66 66 66 90 53 48 89 fb e8 52 2a 1f e1 48 8b bb a0 02 00 00 e8 46 59 1f e1 48 8b 83 a0 02 00 00 5b <48> 8b 38 e9 26 9a fe e0 66 0f 1f 44 00 00 66 66 66 66 90 48 8b RIP [] edac_unregister_sysfs+0x22/0x30 [edac_core] RSP Signed-off-by: Borislav Petkov Cc: Mauro Carvalho Chehab Cc: # v3.6.. Fixes: 7a623c039075 ("edac: rewrite the sysfs code to use struct device") --- drivers/edac/edac_mc_sysfs.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'drivers/edac') diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index a75acea0f674..58aed67b7eba 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -880,21 +880,26 @@ static struct device_type mci_attr_type = { int edac_create_sysfs_mci_device(struct mem_ctl_info *mci, const struct attribute_group **groups) { + char *name; int i, err; /* * The memory controller needs its own bus, in order to avoid * namespace conflicts at /sys/bus/edac. */ - mci->bus->name = kasprintf(GFP_KERNEL, "mc%d", mci->mc_idx); - if (!mci->bus->name) + name = kasprintf(GFP_KERNEL, "mc%d", mci->mc_idx); + if (!name) return -ENOMEM; + mci->bus->name = name; + edac_dbg(0, "creating bus %s\n", mci->bus->name); err = bus_register(mci->bus); - if (err < 0) - goto fail_free_name; + if (err < 0) { + kfree(name); + return err; + } /* get the /sys/devices/system/edac subsys reference */ mci->dev.type = &mci_attr_type; @@ -961,8 +966,8 @@ fail_unregister_dimm: device_unregister(&mci->dev); fail_unregister_bus: bus_unregister(mci->bus); -fail_free_name: - kfree(mci->bus->name); + kfree(name); + return err; } @@ -993,10 +998,12 @@ void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci) void edac_unregister_sysfs(struct mem_ctl_info *mci) { + const char *name = mci->bus->name; + edac_dbg(1, "Unregistering device %s\n", dev_name(&mci->dev)); device_unregister(&mci->dev); bus_unregister(mci->bus); - kfree(mci->bus->name); + kfree(name); } static void mc_attr_release(struct device *dev) -- cgit v1.2.3-58-ga151 From fcd5c4dd8201595d4c598c9cca5e54760277d687 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 27 Nov 2015 10:38:38 +0100 Subject: EDAC: Robustify workqueues destruction EDAC workqueue destruction is really fragile. We cancel delayed work but if it is still running and requeues itself, we still go ahead and destroy the workqueue and the queued work explodes when workqueue core attempts to run it. Make the destruction more robust by switching op_state to offline so that requeuing stops. Cancel any pending work *synchronously* too. EDAC i7core: Driver loaded. general protection fault: 0000 [#1] SMP CPU 12 Modules linked in: Supported: Yes Pid: 0, comm: kworker/0:1 Tainted: G IE 3.0.101-0-default #1 HP ProLiant DL380 G7 RIP: 0010:[] [] __queue_work+0x17/0x3f0 < ... regs ...> Process kworker/0:1 (pid: 0, threadinfo ffff88019def6000, task ffff88019def4600) Stack: ... Call Trace: call_timer_fn run_timer_softirq __do_softirq call_softirq do_softirq irq_exit smp_apic_timer_interrupt apic_timer_interrupt intel_idle cpuidle_idle_call cpu_idle Code: ... RIP __queue_work RSP <...> Signed-off-by: Borislav Petkov Cc: --- drivers/edac/edac_device.c | 11 ++++------- drivers/edac/edac_mc.c | 14 +++----------- drivers/edac/edac_pci.c | 9 ++++----- 3 files changed, 11 insertions(+), 23 deletions(-) (limited to 'drivers/edac') diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c index 592af5f0cf39..53587377e672 100644 --- a/drivers/edac/edac_device.c +++ b/drivers/edac/edac_device.c @@ -435,16 +435,13 @@ void edac_device_workq_setup(struct edac_device_ctl_info *edac_dev, */ void edac_device_workq_teardown(struct edac_device_ctl_info *edac_dev) { - int status; - if (!edac_dev->edac_check) return; - status = cancel_delayed_work(&edac_dev->work); - if (status == 0) { - /* workq instance might be running, wait for it */ - flush_workqueue(edac_workqueue); - } + edac_dev->op_state = OP_OFFLINE; + + cancel_delayed_work_sync(&edac_dev->work); + flush_workqueue(edac_workqueue); } /* diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 77ecd6a4179a..1b2c2187b347 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -586,18 +586,10 @@ static void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec, */ static void edac_mc_workq_teardown(struct mem_ctl_info *mci) { - int status; - - if (mci->op_state != OP_RUNNING_POLL) - return; - - status = cancel_delayed_work(&mci->work); - if (status == 0) { - edac_dbg(0, "not canceled, flush the queue\n"); + mci->op_state = OP_OFFLINE; - /* workq instance might be running, wait for it */ - flush_workqueue(edac_workqueue); - } + cancel_delayed_work_sync(&mci->work); + flush_workqueue(edac_workqueue); } /* diff --git a/drivers/edac/edac_pci.c b/drivers/edac/edac_pci.c index 5034385c47e6..d8b083190695 100644 --- a/drivers/edac/edac_pci.c +++ b/drivers/edac/edac_pci.c @@ -239,13 +239,12 @@ static void edac_pci_workq_setup(struct edac_pci_ctl_info *pci, */ static void edac_pci_workq_teardown(struct edac_pci_ctl_info *pci) { - int status; - edac_dbg(0, "\n"); - status = cancel_delayed_work(&pci->work); - if (status == 0) - flush_workqueue(edac_workqueue); + pci->op_state = OP_OFFLINE; + + cancel_delayed_work_sync(&pci->work); + flush_workqueue(edac_workqueue); } /* -- cgit v1.2.3-58-ga151 From 733476cf207faf574b132523ff2aee78b488ed6b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 27 Nov 2015 11:40:43 +0100 Subject: EDAC: Rip out the edac_subsys reference counting This was really dumb - reference counting for the main EDAC sysfs object. While we could've simply registered it as the first thing in the module init path and then hand it around to what needs it. Do that and rip out all the code around it, thus simplifying the whole handling significantly. Move the edac_subsys node back to edac_module.c. Signed-off-by: Borislav Petkov --- drivers/edac/edac_device_sysfs.c | 6 +----- drivers/edac/edac_mc_sysfs.c | 5 +---- drivers/edac/edac_module.c | 40 +++++++++++++++++++++++++++++++++++++++ drivers/edac/edac_pci_sysfs.c | 6 +----- drivers/edac/edac_stub.c | 41 ---------------------------------------- include/linux/edac.h | 1 - 6 files changed, 43 insertions(+), 56 deletions(-) (limited to 'drivers/edac') diff --git a/drivers/edac/edac_device_sysfs.c b/drivers/edac/edac_device_sysfs.c index fb68a06ad683..170527165374 100644 --- a/drivers/edac/edac_device_sysfs.c +++ b/drivers/edac/edac_device_sysfs.c @@ -256,7 +256,7 @@ int edac_device_register_sysfs_main_kobj(struct edac_device_ctl_info *edac_dev) if (!try_module_get(edac_dev->owner)) { err = -ENODEV; - goto err_mod_get; + goto err_out; } /* register */ @@ -282,9 +282,6 @@ int edac_device_register_sysfs_main_kobj(struct edac_device_ctl_info *edac_dev) err_kobj_reg: module_put(edac_dev->owner); -err_mod_get: - edac_put_sysfs_subsys(); - err_out: return err; } @@ -306,7 +303,6 @@ void edac_device_unregister_sysfs_main_kobj(struct edac_device_ctl_info *dev) * b) 'kfree' the memory */ kobject_put(&dev->kobj); - edac_put_sysfs_subsys(); } /* edac_dev -> instance information */ diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 58aed67b7eba..1c79ae3e083a 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -1039,7 +1039,7 @@ int __init edac_mc_sysfs_init(void) mci_pdev = kzalloc(sizeof(*mci_pdev), GFP_KERNEL); if (!mci_pdev) { err = -ENOMEM; - goto out_put_sysfs; + goto out; } mci_pdev->bus = edac_subsys; @@ -1057,8 +1057,6 @@ int __init edac_mc_sysfs_init(void) out_dev_free: kfree(mci_pdev); - out_put_sysfs: - edac_put_sysfs_subsys(); out: return err; } @@ -1066,5 +1064,4 @@ int __init edac_mc_sysfs_init(void) void edac_mc_sysfs_exit(void) { device_unregister(mci_pdev); - edac_put_sysfs_subsys(); } diff --git a/drivers/edac/edac_module.c b/drivers/edac/edac_module.c index 9cb082a19d8a..059b5924988b 100644 --- a/drivers/edac/edac_module.c +++ b/drivers/edac/edac_module.c @@ -91,6 +91,39 @@ static void edac_workqueue_teardown(void) } } +/* + * sysfs object: /sys/devices/system/edac + * need to export to other files + */ +struct bus_type edac_subsys = { + .name = "edac", + .dev_name = "edac", +}; +EXPORT_SYMBOL_GPL(edac_subsys); + +static int edac_subsys_init(void) +{ + int err; + + /* create the /sys/devices/system/edac directory */ + err = subsys_system_register(&edac_subsys, NULL); + if (err) + printk(KERN_ERR "Error registering toplevel EDAC sysfs dir\n"); + + return err; +} + +static void edac_subsys_exit(void) +{ + bus_unregister(&edac_subsys); +} + +/* return pointer to the 'edac' node in sysfs */ +struct bus_type *edac_get_sysfs_subsys(void) +{ + return &edac_subsys; +} +EXPORT_SYMBOL_GPL(edac_get_sysfs_subsys); /* * edac_init * module initialization entry point @@ -101,6 +134,10 @@ static int __init edac_init(void) edac_printk(KERN_INFO, EDAC_MC, EDAC_VERSION "\n"); + err = edac_subsys_init(); + if (err) + return err; + /* * Harvest and clear any boot/initialization PCI parity errors * @@ -129,6 +166,8 @@ err_wq: edac_mc_sysfs_exit(); err_sysfs: + edac_subsys_exit(); + return err; } @@ -144,6 +183,7 @@ static void __exit edac_exit(void) edac_workqueue_teardown(); edac_mc_sysfs_exit(); edac_debugfs_exit(); + edac_subsys_exit(); } /* diff --git a/drivers/edac/edac_pci_sysfs.c b/drivers/edac/edac_pci_sysfs.c index 24d877f6e577..262f56cca9ff 100644 --- a/drivers/edac/edac_pci_sysfs.c +++ b/drivers/edac/edac_pci_sysfs.c @@ -364,7 +364,7 @@ static int edac_pci_main_kobj_setup(void) if (!try_module_get(THIS_MODULE)) { edac_dbg(1, "try_module_get() failed\n"); err = -ENODEV; - goto mod_get_fail; + goto decrement_count_fail; } edac_pci_top_main_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); @@ -399,9 +399,6 @@ kobject_init_and_add_fail: kzalloc_fail: module_put(THIS_MODULE); -mod_get_fail: - edac_put_sysfs_subsys(); - decrement_count_fail: /* if are on this error exit, nothing to tear down */ atomic_dec(&edac_pci_sysfs_refcount); @@ -426,7 +423,6 @@ static void edac_pci_main_kobj_teardown(void) if (atomic_dec_return(&edac_pci_sysfs_refcount) == 0) { edac_dbg(0, "called kobject_put on main kobj\n"); kobject_put(edac_pci_top_main_kobj); - edac_put_sysfs_subsys(); } } diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c index ff07aae5b7fb..952e411f01f2 100644 --- a/drivers/edac/edac_stub.c +++ b/drivers/edac/edac_stub.c @@ -26,8 +26,6 @@ EXPORT_SYMBOL_GPL(edac_handlers); int edac_err_assert = 0; EXPORT_SYMBOL_GPL(edac_err_assert); -static atomic_t edac_subsys_valid = ATOMIC_INIT(0); - int edac_report_status = EDAC_REPORTING_ENABLED; EXPORT_SYMBOL_GPL(edac_report_status); @@ -68,42 +66,3 @@ void edac_atomic_assert_error(void) edac_err_assert++; } EXPORT_SYMBOL_GPL(edac_atomic_assert_error); - -/* - * sysfs object: /sys/devices/system/edac - * need to export to other files - */ -struct bus_type edac_subsys = { - .name = "edac", - .dev_name = "edac", -}; -EXPORT_SYMBOL_GPL(edac_subsys); - -/* return pointer to the 'edac' node in sysfs */ -struct bus_type *edac_get_sysfs_subsys(void) -{ - int err = 0; - - if (atomic_read(&edac_subsys_valid)) - goto out; - - /* create the /sys/devices/system/edac directory */ - err = subsys_system_register(&edac_subsys, NULL); - if (err) { - printk(KERN_ERR "Error registering toplevel EDAC sysfs dir\n"); - return NULL; - } - -out: - atomic_inc(&edac_subsys_valid); - return &edac_subsys; -} -EXPORT_SYMBOL_GPL(edac_get_sysfs_subsys); - -void edac_put_sysfs_subsys(void) -{ - /* last user unregisters it */ - if (atomic_dec_and_test(&edac_subsys_valid)) - bus_unregister(&edac_subsys); -} -EXPORT_SYMBOL_GPL(edac_put_sysfs_subsys); diff --git a/include/linux/edac.h b/include/linux/edac.h index da6964873dcf..98f915dfeeac 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -33,7 +33,6 @@ extern struct bus_type edac_subsys; extern int edac_handler_set(void); extern void edac_atomic_assert_error(void); extern struct bus_type *edac_get_sysfs_subsys(void); -extern void edac_put_sysfs_subsys(void); enum { EDAC_REPORTING_ENABLED, -- cgit v1.2.3-58-ga151 From a97d26270169dc30ef28f0860097b7dc793206be Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 30 Nov 2015 14:15:31 +0100 Subject: EDAC: Unexport and make edac_subsys static ... and use the accessor instead. Signed-off-by: Borislav Petkov --- drivers/edac/edac_module.c | 3 +-- include/linux/edac.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/edac') diff --git a/drivers/edac/edac_module.c b/drivers/edac/edac_module.c index 059b5924988b..2b53680a687d 100644 --- a/drivers/edac/edac_module.c +++ b/drivers/edac/edac_module.c @@ -95,11 +95,10 @@ static void edac_workqueue_teardown(void) * sysfs object: /sys/devices/system/edac * need to export to other files */ -struct bus_type edac_subsys = { +static struct bus_type edac_subsys = { .name = "edac", .dev_name = "edac", }; -EXPORT_SYMBOL_GPL(edac_subsys); static int edac_subsys_init(void) { diff --git a/include/linux/edac.h b/include/linux/edac.h index 98f915dfeeac..9e0d78966552 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -28,7 +28,6 @@ struct device; extern int edac_op_state; extern int edac_err_assert; extern atomic_t edac_handlers; -extern struct bus_type edac_subsys; extern int edac_handler_set(void); extern void edac_atomic_assert_error(void); -- cgit v1.2.3-58-ga151 From d4538000ca4354a2c07cdd31ce994af7d23e24d6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 30 Nov 2015 14:20:41 +0100 Subject: EDAC: Remove edac_get_sysfs_subsys() error handling It cannot fail now. We either load EDAC core after having successfully initialized edac_subsys or we don't. Signed-off-by: Borislav Petkov --- drivers/edac/edac_device_sysfs.c | 5 ----- drivers/edac/edac_mc_sysfs.c | 11 +---------- drivers/edac/edac_pci_sysfs.c | 10 +--------- 3 files changed, 2 insertions(+), 24 deletions(-) (limited to 'drivers/edac') diff --git a/drivers/edac/edac_device_sysfs.c b/drivers/edac/edac_device_sysfs.c index 170527165374..93da1a45c716 100644 --- a/drivers/edac/edac_device_sysfs.c +++ b/drivers/edac/edac_device_sysfs.c @@ -237,11 +237,6 @@ int edac_device_register_sysfs_main_kobj(struct edac_device_ctl_info *edac_dev) /* get the /sys/devices/system/edac reference */ edac_subsys = edac_get_sysfs_subsys(); - if (edac_subsys == NULL) { - edac_dbg(1, "no edac_subsys error\n"); - err = -ENODEV; - goto err_out; - } /* Point to the 'edac_subsys' this instance 'reports' to */ edac_dev->edac_subsys = edac_subsys; diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 1c79ae3e083a..26e65ab5932a 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -1025,24 +1025,15 @@ static struct device_type mc_attr_type = { */ int __init edac_mc_sysfs_init(void) { - struct bus_type *edac_subsys; int err; - /* get the /sys/devices/system/edac subsys reference */ - edac_subsys = edac_get_sysfs_subsys(); - if (edac_subsys == NULL) { - edac_dbg(1, "no edac_subsys\n"); - err = -EINVAL; - goto out; - } - mci_pdev = kzalloc(sizeof(*mci_pdev), GFP_KERNEL); if (!mci_pdev) { err = -ENOMEM; goto out; } - mci_pdev->bus = edac_subsys; + mci_pdev->bus = edac_get_sysfs_subsys(); mci_pdev->type = &mc_attr_type; device_initialize(mci_pdev); dev_set_name(mci_pdev, "mc"); diff --git a/drivers/edac/edac_pci_sysfs.c b/drivers/edac/edac_pci_sysfs.c index 262f56cca9ff..6e3428ba400f 100644 --- a/drivers/edac/edac_pci_sysfs.c +++ b/drivers/edac/edac_pci_sysfs.c @@ -331,10 +331,7 @@ static struct kobj_type ktype_edac_pci_main_kobj = { }; /** - * edac_pci_main_kobj_setup() - * - * setup the sysfs for EDAC PCI attributes - * assumes edac_subsys has already been initialized + * edac_pci_main_kobj_setup: Setup the sysfs for EDAC PCI attributes. */ static int edac_pci_main_kobj_setup(void) { @@ -351,11 +348,6 @@ static int edac_pci_main_kobj_setup(void) * controls and attributes */ edac_subsys = edac_get_sysfs_subsys(); - if (edac_subsys == NULL) { - edac_dbg(1, "no edac_subsys\n"); - err = -ENODEV; - goto decrement_count_fail; - } /* Bump the reference count on this module to ensure the * modules isn't unloaded until we deconstruct the top -- cgit v1.2.3-58-ga151 From e136fa016f2f06ca6e00d4f99894b4424f3f2a5c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 30 Nov 2015 15:07:28 +0100 Subject: EDAC: Make edac_device workqueue setup/teardown functions static They're not used anywhere else. Signed-off-by: Borislav Petkov --- drivers/edac/edac_device.c | 6 +++--- drivers/edac/edac_module.h | 3 --- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'drivers/edac') diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c index 53587377e672..455a64b67521 100644 --- a/drivers/edac/edac_device.c +++ b/drivers/edac/edac_device.c @@ -402,8 +402,8 @@ static void edac_device_workq_function(struct work_struct *work_req) * initialize a workq item for this edac_device instance * passing in the new delay period in msec */ -void edac_device_workq_setup(struct edac_device_ctl_info *edac_dev, - unsigned msec) +static void edac_device_workq_setup(struct edac_device_ctl_info *edac_dev, + unsigned msec) { edac_dbg(0, "\n"); @@ -433,7 +433,7 @@ void edac_device_workq_setup(struct edac_device_ctl_info *edac_dev, * edac_device_workq_teardown * stop the workq processing on this edac_dev */ -void edac_device_workq_teardown(struct edac_device_ctl_info *edac_dev) +static void edac_device_workq_teardown(struct edac_device_ctl_info *edac_dev) { if (!edac_dev->edac_check) return; diff --git a/drivers/edac/edac_module.h b/drivers/edac/edac_module.h index b95a48fc723d..7388abfbf10b 100644 --- a/drivers/edac/edac_module.h +++ b/drivers/edac/edac_module.h @@ -48,9 +48,6 @@ extern void edac_device_remove_sysfs(struct edac_device_ctl_info *edac_dev); /* edac core workqueue: single CPU mode */ extern struct workqueue_struct *edac_workqueue; -extern void edac_device_workq_setup(struct edac_device_ctl_info *edac_dev, - unsigned msec); -extern void edac_device_workq_teardown(struct edac_device_ctl_info *edac_dev); extern void edac_device_reset_delay_period(struct edac_device_ctl_info *edac_dev, unsigned long value); extern void edac_mc_reset_delay_period(unsigned long value); -- cgit v1.2.3-58-ga151 From c4cf3b454ecaa222aad9017932bd3b9c9325d931 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 30 Nov 2015 19:02:01 +0100 Subject: EDAC: Rework workqueue handling Hide the EDAC workqueue pointer in a separate compilation unit and add accessors for the workqueue manipulations needed. Remove edac_pci_reset_delay_period() which wasn't used by anything. It seems it got added without a user with 91b99041c1d5 ("drivers/edac: updated PCI monitoring") Signed-off-by: Borislav Petkov --- drivers/edac/Makefile | 2 +- drivers/edac/edac_device.c | 28 +++++++++++----------------- drivers/edac/edac_mc.c | 19 +++++++------------ drivers/edac/edac_module.c | 29 ----------------------------- drivers/edac/edac_module.h | 7 ++++++- drivers/edac/edac_pci.c | 32 ++++---------------------------- drivers/edac/wq.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 71 insertions(+), 88 deletions(-) create mode 100644 drivers/edac/wq.c (limited to 'drivers/edac') diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index dbf53e08bdd1..be163e20fe56 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -10,7 +10,7 @@ obj-$(CONFIG_EDAC) := edac_stub.o obj-$(CONFIG_EDAC_MM_EDAC) += edac_core.o edac_core-y := edac_mc.o edac_device.o edac_mc_sysfs.o -edac_core-y += edac_module.o edac_device_sysfs.o +edac_core-y += edac_module.o edac_device_sysfs.o wq.o edac_core-$(CONFIG_EDAC_DEBUG) += debugfs.o diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c index 455a64b67521..a97900333e2d 100644 --- a/drivers/edac/edac_device.c +++ b/drivers/edac/edac_device.c @@ -390,11 +390,9 @@ static void edac_device_workq_function(struct work_struct *work_req) * between integral seconds */ if (edac_dev->poll_msec == 1000) - queue_delayed_work(edac_workqueue, &edac_dev->work, - round_jiffies_relative(edac_dev->delay)); + edac_queue_work(&edac_dev->work, round_jiffies_relative(edac_dev->delay)); else - queue_delayed_work(edac_workqueue, &edac_dev->work, - edac_dev->delay); + edac_queue_work(&edac_dev->work, edac_dev->delay); } /* @@ -422,11 +420,9 @@ static void edac_device_workq_setup(struct edac_device_ctl_info *edac_dev, * to fire together on the 1 second exactly */ if (edac_dev->poll_msec == 1000) - queue_delayed_work(edac_workqueue, &edac_dev->work, - round_jiffies_relative(edac_dev->delay)); + edac_queue_work(&edac_dev->work, round_jiffies_relative(edac_dev->delay)); else - queue_delayed_work(edac_workqueue, &edac_dev->work, - edac_dev->delay); + edac_queue_work(&edac_dev->work, edac_dev->delay); } /* @@ -440,8 +436,7 @@ static void edac_device_workq_teardown(struct edac_device_ctl_info *edac_dev) edac_dev->op_state = OP_OFFLINE; - cancel_delayed_work_sync(&edac_dev->work); - flush_workqueue(edac_workqueue); + edac_stop_work(&edac_dev->work); } /* @@ -454,16 +449,15 @@ static void edac_device_workq_teardown(struct edac_device_ctl_info *edac_dev) void edac_device_reset_delay_period(struct edac_device_ctl_info *edac_dev, unsigned long value) { - /* cancel the current workq request, without the mutex lock */ - edac_device_workq_teardown(edac_dev); + unsigned long jiffs = msecs_to_jiffies(value); - /* acquire the mutex before doing the workq setup */ - mutex_lock(&device_ctls_mutex); + if (value == 1000) + jiffs = round_jiffies_relative(value); - /* restart the workq request, with new delay value */ - edac_device_workq_setup(edac_dev, value); + edac_dev->poll_msec = value; + edac_dev->delay = jiffs; - mutex_unlock(&device_ctls_mutex); + edac_mod_work(&edac_dev->work, jiffs); } /* diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 1b2c2187b347..8adfc167c2e3 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -548,8 +548,7 @@ static void edac_mc_workq_function(struct work_struct *work_req) mutex_unlock(&mem_ctls_mutex); /* Reschedule */ - queue_delayed_work(edac_workqueue, &mci->work, - msecs_to_jiffies(edac_mc_get_poll_msec())); + edac_queue_work(&mci->work, msecs_to_jiffies(edac_mc_get_poll_msec())); } /* @@ -561,8 +560,7 @@ static void edac_mc_workq_function(struct work_struct *work_req) * * called with the mem_ctls_mutex held */ -static void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec, - bool init) +static void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec) { edac_dbg(0, "\n"); @@ -570,10 +568,9 @@ static void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec, if (mci->op_state != OP_RUNNING_POLL) return; - if (init) - INIT_DELAYED_WORK(&mci->work, edac_mc_workq_function); + INIT_DELAYED_WORK(&mci->work, edac_mc_workq_function); - mod_delayed_work(edac_workqueue, &mci->work, msecs_to_jiffies(msec)); + edac_queue_work(&mci->work, msecs_to_jiffies(msec)); } /* @@ -588,8 +585,7 @@ static void edac_mc_workq_teardown(struct mem_ctl_info *mci) { mci->op_state = OP_OFFLINE; - cancel_delayed_work_sync(&mci->work); - flush_workqueue(edac_workqueue); + edac_stop_work(&mci->work); } /* @@ -608,9 +604,8 @@ void edac_mc_reset_delay_period(unsigned long value) list_for_each(item, &mc_devices) { mci = list_entry(item, struct mem_ctl_info, link); - edac_mc_workq_setup(mci, value, false); + edac_mod_work(&mci->work, value); } - mutex_unlock(&mem_ctls_mutex); } @@ -781,7 +776,7 @@ int edac_mc_add_mc_with_groups(struct mem_ctl_info *mci, /* This instance is NOW RUNNING */ mci->op_state = OP_RUNNING_POLL; - edac_mc_workq_setup(mci, edac_mc_get_poll_msec(), true); + edac_mc_workq_setup(mci, edac_mc_get_poll_msec()); } else { mci->op_state = OP_RUNNING_INTERRUPT; } diff --git a/drivers/edac/edac_module.c b/drivers/edac/edac_module.c index 2b53680a687d..5f8543be995a 100644 --- a/drivers/edac/edac_module.c +++ b/drivers/edac/edac_module.c @@ -43,9 +43,6 @@ module_param_call(edac_debug_level, edac_set_debug_level, param_get_int, MODULE_PARM_DESC(edac_debug_level, "EDAC debug level: [0-4], default: 2"); #endif -/* scope is to module level only */ -struct workqueue_struct *edac_workqueue; - /* * edac_op_state_to_string() */ @@ -65,32 +62,6 @@ char *edac_op_state_to_string(int opstate) return "UNKNOWN"; } -/* - * edac_workqueue_setup - * initialize the edac work queue for polling operations - */ -static int edac_workqueue_setup(void) -{ - edac_workqueue = create_singlethread_workqueue("edac-poller"); - if (edac_workqueue == NULL) - return -ENODEV; - else - return 0; -} - -/* - * edac_workqueue_teardown - * teardown the edac workqueue - */ -static void edac_workqueue_teardown(void) -{ - if (edac_workqueue) { - flush_workqueue(edac_workqueue); - destroy_workqueue(edac_workqueue); - edac_workqueue = NULL; - } -} - /* * sysfs object: /sys/devices/system/edac * need to export to other files diff --git a/drivers/edac/edac_module.h b/drivers/edac/edac_module.h index 7388abfbf10b..cfaacb99c973 100644 --- a/drivers/edac/edac_module.h +++ b/drivers/edac/edac_module.h @@ -47,7 +47,12 @@ extern int edac_device_create_sysfs(struct edac_device_ctl_info *edac_dev); extern void edac_device_remove_sysfs(struct edac_device_ctl_info *edac_dev); /* edac core workqueue: single CPU mode */ -extern struct workqueue_struct *edac_workqueue; +int edac_workqueue_setup(void); +void edac_workqueue_teardown(void); +bool edac_queue_work(struct delayed_work *work, unsigned long delay); +bool edac_stop_work(struct delayed_work *work); +bool edac_mod_work(struct delayed_work *work, unsigned long delay); + extern void edac_device_reset_delay_period(struct edac_device_ctl_info *edac_dev, unsigned long value); extern void edac_mc_reset_delay_period(unsigned long value); diff --git a/drivers/edac/edac_pci.c b/drivers/edac/edac_pci.c index d8b083190695..99685388d3fb 100644 --- a/drivers/edac/edac_pci.c +++ b/drivers/edac/edac_pci.c @@ -209,7 +209,7 @@ static void edac_pci_workq_function(struct work_struct *work_req) delay = msecs_to_jiffies(msec); /* Reschedule only if we are in POLL mode */ - queue_delayed_work(edac_workqueue, &pci->work, delay); + edac_queue_work(&pci->work, delay); } mutex_unlock(&edac_pci_ctls_mutex); @@ -229,8 +229,8 @@ static void edac_pci_workq_setup(struct edac_pci_ctl_info *pci, edac_dbg(0, "\n"); INIT_DELAYED_WORK(&pci->work, edac_pci_workq_function); - queue_delayed_work(edac_workqueue, &pci->work, - msecs_to_jiffies(edac_pci_get_poll_msec())); + + edac_queue_work(&pci->work, msecs_to_jiffies(edac_pci_get_poll_msec())); } /* @@ -243,32 +243,8 @@ static void edac_pci_workq_teardown(struct edac_pci_ctl_info *pci) pci->op_state = OP_OFFLINE; - cancel_delayed_work_sync(&pci->work); - flush_workqueue(edac_workqueue); -} - -/* - * edac_pci_reset_delay_period - * - * called with a new period value for the workq period - * a) stop current workq timer - * b) restart workq timer with new value - */ -void edac_pci_reset_delay_period(struct edac_pci_ctl_info *pci, - unsigned long value) -{ - edac_dbg(0, "\n"); - - edac_pci_workq_teardown(pci); - - /* need to lock for the setup */ - mutex_lock(&edac_pci_ctls_mutex); - - edac_pci_workq_setup(pci, value); - - mutex_unlock(&edac_pci_ctls_mutex); + edac_stop_work(&pci->work); } -EXPORT_SYMBOL_GPL(edac_pci_reset_delay_period); /* * edac_pci_alloc_index: Allocate a unique PCI index number diff --git a/drivers/edac/wq.c b/drivers/edac/wq.c new file mode 100644 index 000000000000..1b8c07e44fd8 --- /dev/null +++ b/drivers/edac/wq.c @@ -0,0 +1,42 @@ +#include "edac_module.h" + +static struct workqueue_struct *wq; + +bool edac_queue_work(struct delayed_work *work, unsigned long delay) +{ + return queue_delayed_work(wq, work, delay); +} +EXPORT_SYMBOL_GPL(edac_queue_work); + +bool edac_mod_work(struct delayed_work *work, unsigned long delay) +{ + return mod_delayed_work(wq, work, delay); +} +EXPORT_SYMBOL_GPL(edac_mod_work); + +bool edac_stop_work(struct delayed_work *work) +{ + bool ret; + + ret = cancel_delayed_work_sync(work); + flush_workqueue(wq); + + return ret; +} +EXPORT_SYMBOL_GPL(edac_stop_work); + +int edac_workqueue_setup(void) +{ + wq = create_singlethread_workqueue("edac-poller"); + if (!wq) + return -ENODEV; + else + return 0; +} + +void edac_workqueue_teardown(void) +{ + flush_workqueue(wq); + destroy_workqueue(wq); + wq = NULL; +} -- cgit v1.2.3-58-ga151 From 45f4d3ab3ee8d12471e47057ea4d0d86167b25a0 Mon Sep 17 00:00:00 2001 From: Hubert Chrzaniuk Date: Fri, 11 Dec 2015 14:21:22 +0100 Subject: EDAC, sb_edac: Set fixed DIMM width on Xeon Knights Landing Knights Landing does not come with register that could be used to fetch DIMM width. However the value is fixed for this architecture so it can be hardcoded. Signed-off-by: Hubert Chrzaniuk Cc: Doug Thompson Cc: Mauro Carvalho Chehab Cc: linux-edac Cc: lukasz.anaczkowski@intel.com Link: http://lkml.kernel.org/r/1449840082-18673-1-git-send-email-hubert.chrzaniuk@intel.com Signed-off-by: Borislav Petkov --- drivers/edac/sb_edac.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers/edac') diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index b3d924da5985..e438ee5b433f 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -924,6 +924,12 @@ out: return mtype; } +static enum dev_type knl_get_width(struct sbridge_pvt *pvt, u32 mtr) +{ + /* for KNL value is fixed */ + return DEV_X16; +} + static enum dev_type sbridge_get_width(struct sbridge_pvt *pvt, u32 mtr) { /* there's no way to figure out */ @@ -3393,7 +3399,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type) pvt->info.interleave_list = knl_interleave_list; pvt->info.max_interleave = ARRAY_SIZE(knl_interleave_list); pvt->info.interleave_pkg = ibridge_interleave_pkg; - pvt->info.get_width = ibridge_get_width; + pvt->info.get_width = knl_get_width; mci->ctl_name = kasprintf(GFP_KERNEL, "Knights Landing Socket#%d", mci->mc_idx); -- cgit v1.2.3-58-ga151 From 1cac5503fbf751f121d0c9f96e69d1fdd3eb1471 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 1 Jan 2016 22:59:07 +0800 Subject: EDAC, i5100: Use to_delayed_work() Use to_delayed_work() instead of open-coding it. Signed-off-by: Geliang Tang Cc: linux-edac Link: http://lkml.kernel.org/r/58c0e319c7263a10b692100c657c06c42814aecf.1451659910.git.geliangtang@163.com Signed-off-by: Borislav Petkov --- drivers/edac/i5100_edac.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/edac') diff --git a/drivers/edac/i5100_edac.c b/drivers/edac/i5100_edac.c index 40917775dca1..c655162caf08 100644 --- a/drivers/edac/i5100_edac.c +++ b/drivers/edac/i5100_edac.c @@ -575,9 +575,7 @@ static void i5100_check_error(struct mem_ctl_info *mci) static void i5100_refresh_scrubbing(struct work_struct *work) { - struct delayed_work *i5100_scrubbing = container_of(work, - struct delayed_work, - work); + struct delayed_work *i5100_scrubbing = to_delayed_work(work); struct i5100_priv *priv = container_of(i5100_scrubbing, struct i5100_priv, i5100_scrubbing); -- cgit v1.2.3-58-ga151 From 6f3508f61c814ee852c199988a62bd954c50dfc1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 20 Jan 2016 12:54:51 +0300 Subject: EDAC, amd64_edac: Shift wrapping issue in f1x_get_norm_dct_addr() dct_sel_base_off is declared as a u64 but we're only using the lower 32 bits because of a shift wrapping bug. This can possibly truncate the upper 16 bits of DctSelBaseOffset[47:26], causing us to misdecode the CS row. Fixes: c8e518d5673d ('amd64_edac: Sanitize f10_get_base_addr_offset') Signed-off-by: Dan Carpenter Cc: Aravind Gopalakrishnan Cc: linux-edac Cc: Link: http://lkml.kernel.org/r/20160120095451.GB19898@mwanda Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/edac') diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 9eee13ef83a5..d87a47547ba5 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -1452,7 +1452,7 @@ static u64 f1x_get_norm_dct_addr(struct amd64_pvt *pvt, u8 range, u64 chan_off; u64 dram_base = get_dram_base(pvt, range); u64 hole_off = f10_dhar_offset(pvt); - u64 dct_sel_base_off = (pvt->dct_sel_hi & 0xFFFFFC00) << 16; + u64 dct_sel_base_off = (u64)(pvt->dct_sel_hi & 0xFFFFFC00) << 16; if (hi_rng) { /* -- cgit v1.2.3-58-ga151 From 4d67e3ce75f97148ed127c53d6b0fc854d8b3867 Mon Sep 17 00:00:00 2001 From: Loc Ho Date: Fri, 22 Jan 2016 13:47:04 -0700 Subject: EDAC, xgene: Add missing SoC register bus error handling Add missing register bus error handling for APM X-Gene EDAC SoC and fix a checking condition for CE error promoted to UE. Signed-off-by: Loc Ho Cc: Arnd Bergmann Cc: devicetree@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-edac Cc: Mauro Carvalho Chehab Cc: patches@apm.com Link: http://lkml.kernel.org/r/1453495625-28006-3-git-send-email-lho@apm.com Signed-off-by: Borislav Petkov --- drivers/edac/xgene_edac.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) (limited to 'drivers/edac') diff --git a/drivers/edac/xgene_edac.c b/drivers/edac/xgene_edac.c index 41f876414a18..bf19b6e3bd12 100644 --- a/drivers/edac/xgene_edac.c +++ b/drivers/edac/xgene_edac.c @@ -61,6 +61,7 @@ struct xgene_edac { struct regmap *mcba_map; struct regmap *mcbb_map; struct regmap *efuse_map; + struct regmap *rb_map; void __iomem *pcp_csr; spinlock_t lock; struct dentry *dfs; @@ -1057,7 +1058,7 @@ static bool xgene_edac_l3_promote_to_uc_err(u32 l3cesr, u32 l3celr) case 0x041: return true; } - } else if (L3C_ELR_ERRSYN(l3celr) == 9) + } else if (L3C_ELR_ERRWAY(l3celr) == 9) return true; return false; @@ -1353,6 +1354,17 @@ static int xgene_edac_l3_remove(struct xgene_edac_dev_ctx *l3) #define GLBL_MDED_ERRH 0x0848 #define GLBL_MDED_ERRHMASK 0x084c +/* IO Bus Registers */ +#define RBCSR 0x0000 +#define STICKYERR_MASK BIT(0) +#define RBEIR 0x0008 +#define AGENT_OFFLINE_ERR_MASK BIT(30) +#define UNIMPL_RBPAGE_ERR_MASK BIT(29) +#define WORD_ALIGNED_ERR_MASK BIT(28) +#define PAGE_ACCESS_ERR_MASK BIT(27) +#define WRITE_ACCESS_MASK BIT(26) +#define RBERRADDR_RD(src) ((src) & 0x03FFFFFF) + static const char * const soc_mem_err_v1[] = { "10GbE0", "10GbE1", @@ -1470,6 +1482,51 @@ static void xgene_edac_rb_report(struct edac_device_ctl_info *edac_dev) u32 err_addr_hi; u32 reg; + /* If the register bus resource isn't available, just skip it */ + if (!ctx->edac->rb_map) + goto rb_skip; + + /* + * Check RB access errors + * 1. Out of range + * 2. Un-implemented page + * 3. Un-aligned access + * 4. Offline slave IP + */ + if (regmap_read(ctx->edac->rb_map, RBCSR, ®)) + return; + if (reg & STICKYERR_MASK) { + bool write; + u32 address; + + dev_err(edac_dev->dev, "IOB bus access error(s)\n"); + if (regmap_read(ctx->edac->rb_map, RBEIR, ®)) + return; + write = reg & WRITE_ACCESS_MASK ? 1 : 0; + address = RBERRADDR_RD(reg); + if (reg & AGENT_OFFLINE_ERR_MASK) + dev_err(edac_dev->dev, + "IOB bus %s access to offline agent error\n", + write ? "write" : "read"); + if (reg & UNIMPL_RBPAGE_ERR_MASK) + dev_err(edac_dev->dev, + "IOB bus %s access to unimplemented page error\n", + write ? "write" : "read"); + if (reg & WORD_ALIGNED_ERR_MASK) + dev_err(edac_dev->dev, + "IOB bus %s word aligned access error\n", + write ? "write" : "read"); + if (reg & PAGE_ACCESS_ERR_MASK) + dev_err(edac_dev->dev, + "IOB bus %s to page out of range access error\n", + write ? "write" : "read"); + if (regmap_write(ctx->edac->rb_map, RBEIR, 0)) + return; + if (regmap_write(ctx->edac->rb_map, RBCSR, 0)) + return; + } +rb_skip: + /* IOB Bridge agent transaction error interrupt */ reg = readl(ctx->dev_csr + IOBBATRANSERRINTSTS); if (!reg) @@ -1852,6 +1909,17 @@ static int xgene_edac_probe(struct platform_device *pdev) goto out_err; } + /* + * NOTE: The register bus resource is optional for compatibility + * reason. + */ + edac->rb_map = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, + "regmap-rb"); + if (IS_ERR(edac->rb_map)) { + dev_warn(edac->dev, "missing syscon regmap rb\n"); + edac->rb_map = NULL; + } + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); edac->pcp_csr = devm_ioremap_resource(&pdev->dev, res); if (IS_ERR(edac->pcp_csr)) { -- cgit v1.2.3-58-ga151 From 096676061987c613bdacddbae838cb63a815db94 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 2 Feb 2016 10:59:53 +0100 Subject: EDAC: Balance workqueue setup and teardown We use the ->edac_check function pointers to determine whether we need to setup a polling workqueue. However, the destroy path is not balanced and we might try to teardown an unitialized workqueue. Balance init and destroy paths by looking at ->edac_check in both cases. Set op_state to OP_OFFLINE *before* destroying anything. Reported-by: Zhiqiang Hou Cc: Varun Sethi Signed-off-by: Borislav Petkov --- drivers/edac/edac_mc.c | 15 +++++++-------- drivers/edac/edac_pci.c | 8 +++----- 2 files changed, 10 insertions(+), 13 deletions(-) (limited to 'drivers/edac') diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 8adfc167c2e3..50802c154915 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -583,8 +583,6 @@ static void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec) */ static void edac_mc_workq_teardown(struct mem_ctl_info *mci) { - mci->op_state = OP_OFFLINE; - edac_stop_work(&mci->work); } @@ -772,7 +770,7 @@ int edac_mc_add_mc_with_groups(struct mem_ctl_info *mci, } /* If there IS a check routine, then we are running POLLED */ - if (mci->edac_check != NULL) { + if (mci->edac_check) { /* This instance is NOW RUNNING */ mci->op_state = OP_RUNNING_POLL; @@ -823,15 +821,16 @@ struct mem_ctl_info *edac_mc_del_mc(struct device *dev) return NULL; } + /* mark MCI offline: */ + mci->op_state = OP_OFFLINE; + if (!del_mc_from_global_list(mci)) edac_mc_owner = NULL; - mutex_unlock(&mem_ctls_mutex); - /* flush workq processes */ - edac_mc_workq_teardown(mci); + mutex_unlock(&mem_ctls_mutex); - /* marking MCI offline */ - mci->op_state = OP_OFFLINE; + if (mci->edac_check) + edac_mc_workq_teardown(mci); /* remove from sysfs */ edac_remove_sysfs_mci_device(mci); diff --git a/drivers/edac/edac_pci.c b/drivers/edac/edac_pci.c index 99685388d3fb..f0e8c3d01ed5 100644 --- a/drivers/edac/edac_pci.c +++ b/drivers/edac/edac_pci.c @@ -241,8 +241,6 @@ static void edac_pci_workq_teardown(struct edac_pci_ctl_info *pci) { edac_dbg(0, "\n"); - pci->op_state = OP_OFFLINE; - edac_stop_work(&pci->work); } @@ -289,7 +287,7 @@ int edac_pci_add_device(struct edac_pci_ctl_info *pci, int edac_idx) goto fail1; } - if (pci->edac_check != NULL) { + if (pci->edac_check) { pci->op_state = OP_RUNNING_POLL; edac_pci_workq_setup(pci, 1000); @@ -350,8 +348,8 @@ struct edac_pci_ctl_info *edac_pci_del_device(struct device *dev) mutex_unlock(&edac_pci_ctls_mutex); - /* stop the workq timer */ - edac_pci_workq_teardown(pci); + if (pci->edac_check) + edac_pci_workq_teardown(pci); edac_printk(KERN_INFO, EDAC_PCI, "Removed device %d for %s %s: DEV %s\n", -- cgit v1.2.3-58-ga151 From 626a7a4dba2ad672b66b675f961de214f0ad0c74 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 2 Feb 2016 11:06:41 +0100 Subject: EDAC: Kill workqueue setup/teardown functions We have the generic wrappers now, use those. edac_pci_workq_setup() had an unused argument anyway. Signed-off-by: Borislav Petkov --- drivers/edac/edac_mc.c | 43 ++++--------------------------------------- drivers/edac/edac_pci.c | 35 ++++------------------------------- 2 files changed, 8 insertions(+), 70 deletions(-) (limited to 'drivers/edac') diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 50802c154915..050493edb505 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -551,41 +551,6 @@ static void edac_mc_workq_function(struct work_struct *work_req) edac_queue_work(&mci->work, msecs_to_jiffies(edac_mc_get_poll_msec())); } -/* - * edac_mc_workq_setup - * initialize a workq item for this mci - * passing in the new delay period in msec - * - * locking model: - * - * called with the mem_ctls_mutex held - */ -static void edac_mc_workq_setup(struct mem_ctl_info *mci, unsigned msec) -{ - edac_dbg(0, "\n"); - - /* if this instance is not in the POLL state, then simply return */ - if (mci->op_state != OP_RUNNING_POLL) - return; - - INIT_DELAYED_WORK(&mci->work, edac_mc_workq_function); - - edac_queue_work(&mci->work, msecs_to_jiffies(msec)); -} - -/* - * edac_mc_workq_teardown - * stop the workq processing on this mci - * - * locking model: - * - * called WITHOUT lock held - */ -static void edac_mc_workq_teardown(struct mem_ctl_info *mci) -{ - edac_stop_work(&mci->work); -} - /* * edac_mc_reset_delay_period(unsigned long value) * @@ -769,12 +734,12 @@ int edac_mc_add_mc_with_groups(struct mem_ctl_info *mci, goto fail1; } - /* If there IS a check routine, then we are running POLLED */ if (mci->edac_check) { - /* This instance is NOW RUNNING */ mci->op_state = OP_RUNNING_POLL; - edac_mc_workq_setup(mci, edac_mc_get_poll_msec()); + INIT_DELAYED_WORK(&mci->work, edac_mc_workq_function); + edac_queue_work(&mci->work, msecs_to_jiffies(edac_mc_get_poll_msec())); + } else { mci->op_state = OP_RUNNING_INTERRUPT; } @@ -830,7 +795,7 @@ struct mem_ctl_info *edac_mc_del_mc(struct device *dev) mutex_unlock(&mem_ctls_mutex); if (mci->edac_check) - edac_mc_workq_teardown(mci); + edac_stop_work(&mci->work); /* remove from sysfs */ edac_remove_sysfs_mci_device(mci); diff --git a/drivers/edac/edac_pci.c b/drivers/edac/edac_pci.c index f0e8c3d01ed5..79945e0df0dc 100644 --- a/drivers/edac/edac_pci.c +++ b/drivers/edac/edac_pci.c @@ -215,35 +215,6 @@ static void edac_pci_workq_function(struct work_struct *work_req) mutex_unlock(&edac_pci_ctls_mutex); } -/* - * edac_pci_workq_setup() - * initialize a workq item for this edac_pci instance - * passing in the new delay period in msec - * - * locking model: - * called when 'edac_pci_ctls_mutex' is locked - */ -static void edac_pci_workq_setup(struct edac_pci_ctl_info *pci, - unsigned int msec) -{ - edac_dbg(0, "\n"); - - INIT_DELAYED_WORK(&pci->work, edac_pci_workq_function); - - edac_queue_work(&pci->work, msecs_to_jiffies(edac_pci_get_poll_msec())); -} - -/* - * edac_pci_workq_teardown() - * stop the workq processing on this edac_pci instance - */ -static void edac_pci_workq_teardown(struct edac_pci_ctl_info *pci) -{ - edac_dbg(0, "\n"); - - edac_stop_work(&pci->work); -} - /* * edac_pci_alloc_index: Allocate a unique PCI index number * @@ -290,7 +261,9 @@ int edac_pci_add_device(struct edac_pci_ctl_info *pci, int edac_idx) if (pci->edac_check) { pci->op_state = OP_RUNNING_POLL; - edac_pci_workq_setup(pci, 1000); + INIT_DELAYED_WORK(&pci->work, edac_pci_workq_function); + edac_queue_work(&pci->work, msecs_to_jiffies(edac_pci_get_poll_msec())); + } else { pci->op_state = OP_RUNNING_INTERRUPT; } @@ -349,7 +322,7 @@ struct edac_pci_ctl_info *edac_pci_del_device(struct device *dev) mutex_unlock(&edac_pci_ctls_mutex); if (pci->edac_check) - edac_pci_workq_teardown(pci); + edac_stop_work(&pci->work); edac_printk(KERN_INFO, EDAC_PCI, "Removed device %d for %s %s: DEV %s\n", -- cgit v1.2.3-58-ga151 From 06e912d4d4c2624c169997e26b3d7b5746735a14 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 2 Feb 2016 11:36:11 +0100 Subject: EDAC: Cleanup/sync workqueue functions They're both running only when ->edac_check is initialized so remove that check from the workqueue function itself. Synchronize/generalize the ->op_state check between the two. Kill useless comments, while at it. Signed-off-by: Borislav Petkov --- drivers/edac/edac_mc.c | 8 +++----- drivers/edac/edac_pci.c | 30 +++++++++++++++--------------- 2 files changed, 18 insertions(+), 20 deletions(-) (limited to 'drivers/edac') diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 050493edb505..1472f48c8ac6 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -535,19 +535,17 @@ static void edac_mc_workq_function(struct work_struct *work_req) mutex_lock(&mem_ctls_mutex); - /* if this control struct has movd to offline state, we are done */ - if (mci->op_state == OP_OFFLINE) { + if (mci->op_state != OP_RUNNING_POLL) { mutex_unlock(&mem_ctls_mutex); return; } - /* Only poll controllers that are running polled and have a check */ - if (edac_mc_assert_error_check_and_clear() && (mci->edac_check != NULL)) + if (edac_mc_assert_error_check_and_clear()) mci->edac_check(mci); mutex_unlock(&mem_ctls_mutex); - /* Reschedule */ + /* Queue ourselves again. */ edac_queue_work(&mci->work, msecs_to_jiffies(edac_mc_get_poll_msec())); } diff --git a/drivers/edac/edac_pci.c b/drivers/edac/edac_pci.c index 79945e0df0dc..8f2f2899a7a2 100644 --- a/drivers/edac/edac_pci.c +++ b/drivers/edac/edac_pci.c @@ -195,23 +195,23 @@ static void edac_pci_workq_function(struct work_struct *work_req) mutex_lock(&edac_pci_ctls_mutex); - if (pci->op_state == OP_RUNNING_POLL) { - /* we might be in POLL mode, but there may NOT be a poll func - */ - if ((pci->edac_check != NULL) && edac_pci_get_check_errors()) - pci->edac_check(pci); - - /* if we are on a one second period, then use round */ - msec = edac_pci_get_poll_msec(); - if (msec == 1000) - delay = round_jiffies_relative(msecs_to_jiffies(msec)); - else - delay = msecs_to_jiffies(msec); - - /* Reschedule only if we are in POLL mode */ - edac_queue_work(&pci->work, delay); + if (pci->op_state != OP_RUNNING_POLL) { + mutex_unlock(&edac_pci_ctls_mutex); + return; } + if (edac_pci_get_check_errors()) + pci->edac_check(pci); + + /* if we are on a one second period, then use round */ + msec = edac_pci_get_poll_msec(); + if (msec == 1000) + delay = round_jiffies_relative(msecs_to_jiffies(msec)); + else + delay = msecs_to_jiffies(msec); + + edac_queue_work(&pci->work, delay); + mutex_unlock(&edac_pci_ctls_mutex); } -- cgit v1.2.3-58-ga151 From f2b59ac66f980e7636c15e2ebbd09f0ef46881ab Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Tue, 2 Feb 2016 21:09:33 +0530 Subject: EDAC, mpc85xx: Silence unused variable warning We were getting this build warning: drivers/edac/mpc85xx_edac.c:1247:6: warning: unused variable 'pvr' pvr is only used if CONFIG_FSL_SOC_BOOKE is defined. Declare it __maybe_unused. Suggested-by: Guenter Roeck Signed-off-by: Sudip Mukherjee Reviewed-by: Johannes Thumshirn Cc: linux-edac Link: http://lkml.kernel.org/r/1454427573-7994-1-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Borislav Petkov --- drivers/edac/mpc85xx_edac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/edac') diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c index b7139c160baf..ca63d0da8889 100644 --- a/drivers/edac/mpc85xx_edac.c +++ b/drivers/edac/mpc85xx_edac.c @@ -1244,7 +1244,7 @@ static struct platform_driver * const drivers[] = { static int __init mpc85xx_mc_init(void) { int res = 0; - u32 pvr = 0; + u32 __maybe_unused pvr = 0; printk(KERN_INFO "Freescale(R) MPC85xx EDAC driver, " "(C) 2006 Montavista Software\n"); -- cgit v1.2.3-58-ga151 From 9bf4f005672073f6bae2edf84e6cb5c4fb16ffc6 Mon Sep 17 00:00:00 2001 From: Thor Thayer Date: Tue, 9 Feb 2016 18:29:25 -0600 Subject: EDAC: Use edac_debugfs_remove_recursive() in edac_debugfs_exit() debugfs_remove() is used to remove a file or a directory from the debugfs filesystem on an EDAC device exit. However edac_debugfs might not be empty. This is similar to 30f84a891bf6 ("EDAC: Use edac_debugfs_remove_recursive()") which changed the EDAC MCI code to use edac_debugfs_remove_recursive(). Suggested-by: Borislav Petkov Signed-off-by: Thor Thayer Cc: linux-edac Link: http://lkml.kernel.org/r/1455064165-3816-1-git-send-email-tthayer@opensource.altera.com Signed-off-by: Borislav Petkov --- drivers/edac/debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/edac') diff --git a/drivers/edac/debugfs.c b/drivers/edac/debugfs.c index 54d2f668cb0a..92dbb7e2320c 100644 --- a/drivers/edac/debugfs.c +++ b/drivers/edac/debugfs.c @@ -53,7 +53,7 @@ int __init edac_debugfs_init(void) void edac_debugfs_exit(void) { - debugfs_remove(edac_debugfs); + debugfs_remove_recursive(edac_debugfs); } int edac_create_debugfs_nodes(struct mem_ctl_info *mci) -- cgit v1.2.3-58-ga151 From c3eea1942a16db52ebea0382bd5826f75b9b7e9b Mon Sep 17 00:00:00 2001 From: Thor Thayer Date: Wed, 10 Feb 2016 13:26:21 -0600 Subject: EDAC, altera: Add Altera L2 cache and OCRAM support Add L2 Cache and On-Chip RAM EDAC support for the Altera SoCs. The SDRAM controller is using the Memory Controller model. Each type of ECC is individually configurable. Signed-off-by: Thor Thayer Cc: devicetree@vger.kernel.org Cc: dinguyen@opensource.altera.com Cc: galak@codeaurora.org Cc: grant.likely@linaro.org Cc: ijc+devicetree@hellion.org.uk Cc: linux-arm-kernel@lists.infradead.org Cc: linux@arm.linux.org.uk Cc: linux-doc@vger.kernel.org Cc: linux-edac Cc: mark.rutland@arm.com Cc: Mauro Carvalho Chehab Cc: pawel.moll@arm.com Cc: robh+dt@kernel.org Link: http://lkml.kernel.org/r/1455132384-17108-1-git-send-email-tthayer@opensource.altera.com Signed-off-by: Borislav Petkov --- drivers/edac/Kconfig | 26 ++- drivers/edac/Makefile | 2 +- drivers/edac/altera_edac.c | 492 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 512 insertions(+), 8 deletions(-) (limited to 'drivers/edac') diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index ef25000a5bc6..37755e63cc28 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -367,14 +367,30 @@ config EDAC_OCTEON_PCI Support for error detection and correction on the Cavium Octeon family of SOCs. -config EDAC_ALTERA_MC - bool "Altera SDRAM Memory Controller EDAC" +config EDAC_ALTERA + bool "Altera SOCFPGA ECC" depends on EDAC_MM_EDAC=y && ARCH_SOCFPGA help Support for error detection and correction on the - Altera SDRAM memory controller. Note that the - preloader must initialize the SDRAM before loading - the kernel. + Altera SOCs. This must be selected for SDRAM ECC. + Note that the preloader must initialize the SDRAM + before loading the kernel. + +config EDAC_ALTERA_L2C + bool "Altera L2 Cache ECC" + depends on EDAC_ALTERA=y + select CACHE_L2X0 + help + Support for error detection and correction on the + Altera L2 cache Memory for Altera SoCs. This option + requires L2 cache so it will force that selection. + +config EDAC_ALTERA_OCRAM + bool "Altera On-Chip RAM ECC" + depends on EDAC_ALTERA=y && SRAM && GENERIC_ALLOCATOR + help + Support for error detection and correction on the + Altera On-Chip RAM Memory for Altera SoCs. config EDAC_SYNOPSYS tristate "Synopsys DDR Memory Controller" diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index be163e20fe56..f9e4a3e0e6e9 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -67,6 +67,6 @@ obj-$(CONFIG_EDAC_OCTEON_L2C) += octeon_edac-l2c.o obj-$(CONFIG_EDAC_OCTEON_LMC) += octeon_edac-lmc.o obj-$(CONFIG_EDAC_OCTEON_PCI) += octeon_edac-pci.o -obj-$(CONFIG_EDAC_ALTERA_MC) += altera_edac.o +obj-$(CONFIG_EDAC_ALTERA) += altera_edac.o obj-$(CONFIG_EDAC_SYNOPSYS) += synopsys_edac.o obj-$(CONFIG_EDAC_XGENE) += xgene_edac.o diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index 929640981d8a..63e42098726d 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -1,5 +1,5 @@ /* - * Copyright Altera Corporation (C) 2014-2015. All rights reserved. + * Copyright Altera Corporation (C) 2014-2016. All rights reserved. * Copyright 2011-2012 Calxeda, Inc. * * This program is free software; you can redistribute it and/or modify it @@ -17,8 +17,10 @@ * Adapted from the highbank_mc_edac driver. */ +#include #include #include +#include #include #include #include @@ -34,6 +36,7 @@ #define EDAC_MOD_STR "altera_edac" #define EDAC_VERSION "1" +#define EDAC_DEVICE "Altera" static const struct altr_sdram_prv_data c5_data = { .ecc_ctrl_offset = CV_CTLCFG_OFST, @@ -75,6 +78,31 @@ static const struct altr_sdram_prv_data a10_data = { .ue_set_mask = A10_DIAGINT_TDERRA_MASK, }; +/************************** EDAC Device Defines **************************/ + +/* OCRAM ECC Management Group Defines */ +#define ALTR_MAN_GRP_OCRAM_ECC_OFFSET 0x04 +#define ALTR_OCR_ECC_EN BIT(0) +#define ALTR_OCR_ECC_INJS BIT(1) +#define ALTR_OCR_ECC_INJD BIT(2) +#define ALTR_OCR_ECC_SERR BIT(3) +#define ALTR_OCR_ECC_DERR BIT(4) + +/* L2 ECC Management Group Defines */ +#define ALTR_MAN_GRP_L2_ECC_OFFSET 0x00 +#define ALTR_L2_ECC_EN BIT(0) +#define ALTR_L2_ECC_INJS BIT(1) +#define ALTR_L2_ECC_INJD BIT(2) + +#define ALTR_UE_TRIGGER_CHAR 'U' /* Trigger for UE */ +#define ALTR_TRIGGER_READ_WRD_CNT 32 /* Line size x 4 */ +#define ALTR_TRIG_OCRAM_BYTE_SIZE 128 /* Line size x 4 */ +#define ALTR_TRIG_L2C_BYTE_SIZE 4096 /* Full Page */ + +/*********************** EDAC Memory Controller Functions ****************/ + +/* The SDRAM controller uses the EDAC Memory Controller framework. */ + static irqreturn_t altr_sdram_mc_err_handler(int irq, void *dev_id) { struct mem_ctl_info *mci = dev_id; @@ -504,6 +532,466 @@ static struct platform_driver altr_sdram_edac_driver = { module_platform_driver(altr_sdram_edac_driver); +/************************* EDAC Parent Probe *************************/ + +static const struct of_device_id altr_edac_device_of_match[]; + +static const struct of_device_id altr_edac_of_match[] = { + { .compatible = "altr,socfpga-ecc-manager" }, + {}, +}; +MODULE_DEVICE_TABLE(of, altr_edac_of_match); + +static int altr_edac_probe(struct platform_device *pdev) +{ + of_platform_populate(pdev->dev.of_node, altr_edac_device_of_match, + NULL, &pdev->dev); + return 0; +} + +static struct platform_driver altr_edac_driver = { + .probe = altr_edac_probe, + .driver = { + .name = "socfpga_ecc_manager", + .of_match_table = altr_edac_of_match, + }, +}; +module_platform_driver(altr_edac_driver); + +/************************* EDAC Device Functions *************************/ + +/* + * EDAC Device Functions (shared between various IPs). + * The discrete memories use the EDAC Device framework. The probe + * and error handling functions are very similar between memories + * so they are shared. The memory allocation and freeing for EDAC + * trigger testing are different for each memory. + */ + +const struct edac_device_prv_data ocramecc_data; +const struct edac_device_prv_data l2ecc_data; + +struct edac_device_prv_data { + int (*setup)(struct platform_device *pdev, void __iomem *base); + int ce_clear_mask; + int ue_clear_mask; + char dbgfs_name[20]; + void * (*alloc_mem)(size_t size, void **other); + void (*free_mem)(void *p, size_t size, void *other); + int ecc_enable_mask; + int ce_set_mask; + int ue_set_mask; + int trig_alloc_sz; +}; + +struct altr_edac_device_dev { + void __iomem *base; + int sb_irq; + int db_irq; + const struct edac_device_prv_data *data; + struct dentry *debugfs_dir; + char *edac_dev_name; +}; + +static irqreturn_t altr_edac_device_handler(int irq, void *dev_id) +{ + irqreturn_t ret_value = IRQ_NONE; + struct edac_device_ctl_info *dci = dev_id; + struct altr_edac_device_dev *drvdata = dci->pvt_info; + const struct edac_device_prv_data *priv = drvdata->data; + + if (irq == drvdata->sb_irq) { + if (priv->ce_clear_mask) + writel(priv->ce_clear_mask, drvdata->base); + edac_device_handle_ce(dci, 0, 0, drvdata->edac_dev_name); + ret_value = IRQ_HANDLED; + } else if (irq == drvdata->db_irq) { + if (priv->ue_clear_mask) + writel(priv->ue_clear_mask, drvdata->base); + edac_device_handle_ue(dci, 0, 0, drvdata->edac_dev_name); + panic("\nEDAC:ECC_DEVICE[Uncorrectable errors]\n"); + ret_value = IRQ_HANDLED; + } else { + WARN_ON(1); + } + + return ret_value; +} + +static ssize_t altr_edac_device_trig(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) + +{ + u32 *ptemp, i, error_mask; + int result = 0; + u8 trig_type; + unsigned long flags; + struct edac_device_ctl_info *edac_dci = file->private_data; + struct altr_edac_device_dev *drvdata = edac_dci->pvt_info; + const struct edac_device_prv_data *priv = drvdata->data; + void *generic_ptr = edac_dci->dev; + + if (!user_buf || get_user(trig_type, user_buf)) + return -EFAULT; + + if (!priv->alloc_mem) + return -ENOMEM; + + /* + * Note that generic_ptr is initialized to the device * but in + * some alloc_functions, this is overridden and returns data. + */ + ptemp = priv->alloc_mem(priv->trig_alloc_sz, &generic_ptr); + if (!ptemp) { + edac_printk(KERN_ERR, EDAC_DEVICE, + "Inject: Buffer Allocation error\n"); + return -ENOMEM; + } + + if (trig_type == ALTR_UE_TRIGGER_CHAR) + error_mask = priv->ue_set_mask; + else + error_mask = priv->ce_set_mask; + + edac_printk(KERN_ALERT, EDAC_DEVICE, + "Trigger Error Mask (0x%X)\n", error_mask); + + local_irq_save(flags); + /* write ECC corrupted data out. */ + for (i = 0; i < (priv->trig_alloc_sz / sizeof(*ptemp)); i++) { + /* Read data so we're in the correct state */ + rmb(); + if (ACCESS_ONCE(ptemp[i])) + result = -1; + /* Toggle Error bit (it is latched), leave ECC enabled */ + writel(error_mask, drvdata->base); + writel(priv->ecc_enable_mask, drvdata->base); + ptemp[i] = i; + } + /* Ensure it has been written out */ + wmb(); + local_irq_restore(flags); + + if (result) + edac_printk(KERN_ERR, EDAC_DEVICE, "Mem Not Cleared\n"); + + /* Read out written data. ECC error caused here */ + for (i = 0; i < ALTR_TRIGGER_READ_WRD_CNT; i++) + if (ACCESS_ONCE(ptemp[i]) != i) + edac_printk(KERN_ERR, EDAC_DEVICE, + "Read doesn't match written data\n"); + + if (priv->free_mem) + priv->free_mem(ptemp, priv->trig_alloc_sz, generic_ptr); + + return count; +} + +static const struct file_operations altr_edac_device_inject_fops = { + .open = simple_open, + .write = altr_edac_device_trig, + .llseek = generic_file_llseek, +}; + +static void altr_create_edacdev_dbgfs(struct edac_device_ctl_info *edac_dci, + const struct edac_device_prv_data *priv) +{ + struct altr_edac_device_dev *drvdata = edac_dci->pvt_info; + + if (!IS_ENABLED(CONFIG_EDAC_DEBUG)) + return; + + drvdata->debugfs_dir = edac_debugfs_create_dir(drvdata->edac_dev_name); + if (!drvdata->debugfs_dir) + return; + + if (!edac_debugfs_create_file(priv->dbgfs_name, S_IWUSR, + drvdata->debugfs_dir, edac_dci, + &altr_edac_device_inject_fops)) + debugfs_remove_recursive(drvdata->debugfs_dir); +} + +static const struct of_device_id altr_edac_device_of_match[] = { +#ifdef CONFIG_EDAC_ALTERA_L2C + { .compatible = "altr,socfpga-l2-ecc", .data = (void *)&l2ecc_data }, +#endif +#ifdef CONFIG_EDAC_ALTERA_OCRAM + { .compatible = "altr,socfpga-ocram-ecc", + .data = (void *)&ocramecc_data }, +#endif + {}, +}; +MODULE_DEVICE_TABLE(of, altr_edac_device_of_match); + +/* + * altr_edac_device_probe() + * This is a generic EDAC device driver that will support + * various Altera memory devices such as the L2 cache ECC and + * OCRAM ECC as well as the memories for other peripherals. + * Module specific initialization is done by passing the + * function index in the device tree. + */ +static int altr_edac_device_probe(struct platform_device *pdev) +{ + struct edac_device_ctl_info *dci; + struct altr_edac_device_dev *drvdata; + struct resource *r; + int res = 0; + struct device_node *np = pdev->dev.of_node; + char *ecc_name = (char *)np->name; + static int dev_instance; + + if (!devres_open_group(&pdev->dev, NULL, GFP_KERNEL)) { + edac_printk(KERN_ERR, EDAC_DEVICE, + "Unable to open devm\n"); + return -ENOMEM; + } + + r = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!r) { + edac_printk(KERN_ERR, EDAC_DEVICE, + "Unable to get mem resource\n"); + res = -ENODEV; + goto fail; + } + + if (!devm_request_mem_region(&pdev->dev, r->start, resource_size(r), + dev_name(&pdev->dev))) { + edac_printk(KERN_ERR, EDAC_DEVICE, + "%s:Error requesting mem region\n", ecc_name); + res = -EBUSY; + goto fail; + } + + dci = edac_device_alloc_ctl_info(sizeof(*drvdata), ecc_name, + 1, ecc_name, 1, 0, NULL, 0, + dev_instance++); + + if (!dci) { + edac_printk(KERN_ERR, EDAC_DEVICE, + "%s: Unable to allocate EDAC device\n", ecc_name); + res = -ENOMEM; + goto fail; + } + + drvdata = dci->pvt_info; + dci->dev = &pdev->dev; + platform_set_drvdata(pdev, dci); + drvdata->edac_dev_name = ecc_name; + + drvdata->base = devm_ioremap(&pdev->dev, r->start, resource_size(r)); + if (!drvdata->base) + goto fail1; + + /* Get driver specific data for this EDAC device */ + drvdata->data = of_match_node(altr_edac_device_of_match, np)->data; + + /* Check specific dependencies for the module */ + if (drvdata->data->setup) { + res = drvdata->data->setup(pdev, drvdata->base); + if (res) + goto fail1; + } + + drvdata->sb_irq = platform_get_irq(pdev, 0); + res = devm_request_irq(&pdev->dev, drvdata->sb_irq, + altr_edac_device_handler, + 0, dev_name(&pdev->dev), dci); + if (res) + goto fail1; + + drvdata->db_irq = platform_get_irq(pdev, 1); + res = devm_request_irq(&pdev->dev, drvdata->db_irq, + altr_edac_device_handler, + 0, dev_name(&pdev->dev), dci); + if (res) + goto fail1; + + dci->mod_name = "Altera ECC Manager"; + dci->dev_name = drvdata->edac_dev_name; + + res = edac_device_add_device(dci); + if (res) + goto fail1; + + altr_create_edacdev_dbgfs(dci, drvdata->data); + + devres_close_group(&pdev->dev, NULL); + + return 0; + +fail1: + edac_device_free_ctl_info(dci); +fail: + devres_release_group(&pdev->dev, NULL); + edac_printk(KERN_ERR, EDAC_DEVICE, + "%s:Error setting up EDAC device: %d\n", ecc_name, res); + + return res; +} + +static int altr_edac_device_remove(struct platform_device *pdev) +{ + struct edac_device_ctl_info *dci = platform_get_drvdata(pdev); + struct altr_edac_device_dev *drvdata = dci->pvt_info; + + debugfs_remove_recursive(drvdata->debugfs_dir); + edac_device_del_device(&pdev->dev); + edac_device_free_ctl_info(dci); + + return 0; +} + +static struct platform_driver altr_edac_device_driver = { + .probe = altr_edac_device_probe, + .remove = altr_edac_device_remove, + .driver = { + .name = "altr_edac_device", + .of_match_table = altr_edac_device_of_match, + }, +}; +module_platform_driver(altr_edac_device_driver); + +/*********************** OCRAM EDAC Device Functions *********************/ + +#ifdef CONFIG_EDAC_ALTERA_OCRAM + +static void *ocram_alloc_mem(size_t size, void **other) +{ + struct device_node *np; + struct gen_pool *gp; + void *sram_addr; + + np = of_find_compatible_node(NULL, NULL, "altr,socfpga-ocram-ecc"); + if (!np) + return NULL; + + gp = of_gen_pool_get(np, "iram", 0); + of_node_put(np); + if (!gp) + return NULL; + + sram_addr = (void *)gen_pool_alloc(gp, size); + if (!sram_addr) + return NULL; + + memset(sram_addr, 0, size); + /* Ensure data is written out */ + wmb(); + + /* Remember this handle for freeing later */ + *other = gp; + + return sram_addr; +} + +static void ocram_free_mem(void *p, size_t size, void *other) +{ + gen_pool_free((struct gen_pool *)other, (u32)p, size); +} + +/* + * altr_ocram_check_deps() + * Test for OCRAM cache ECC dependencies upon entry because + * platform specific startup should have initialized the + * On-Chip RAM memory and enabled the ECC. + * Can't turn on ECC here because accessing un-initialized + * memory will cause CE/UE errors possibly causing an ABORT. + */ +static int altr_ocram_check_deps(struct platform_device *pdev, + void __iomem *base) +{ + if (readl(base) & ALTR_OCR_ECC_EN) + return 0; + + edac_printk(KERN_ERR, EDAC_DEVICE, + "OCRAM: No ECC present or ECC disabled.\n"); + return -ENODEV; +} + +const struct edac_device_prv_data ocramecc_data = { + .setup = altr_ocram_check_deps, + .ce_clear_mask = (ALTR_OCR_ECC_EN | ALTR_OCR_ECC_SERR), + .ue_clear_mask = (ALTR_OCR_ECC_EN | ALTR_OCR_ECC_DERR), + .dbgfs_name = "altr_ocram_trigger", + .alloc_mem = ocram_alloc_mem, + .free_mem = ocram_free_mem, + .ecc_enable_mask = ALTR_OCR_ECC_EN, + .ce_set_mask = (ALTR_OCR_ECC_EN | ALTR_OCR_ECC_INJS), + .ue_set_mask = (ALTR_OCR_ECC_EN | ALTR_OCR_ECC_INJD), + .trig_alloc_sz = ALTR_TRIG_OCRAM_BYTE_SIZE, +}; + +#endif /* CONFIG_EDAC_ALTERA_OCRAM */ + +/********************* L2 Cache EDAC Device Functions ********************/ + +#ifdef CONFIG_EDAC_ALTERA_L2C + +static void *l2_alloc_mem(size_t size, void **other) +{ + struct device *dev = *other; + void *ptemp = devm_kzalloc(dev, size, GFP_KERNEL); + + if (!ptemp) + return NULL; + + /* Make sure everything is written out */ + wmb(); + + /* + * Clean all cache levels up to LoC (includes L2) + * This ensures the corrupted data is written into + * L2 cache for readback test (which causes ECC error). + */ + flush_cache_all(); + + return ptemp; +} + +static void l2_free_mem(void *p, size_t size, void *other) +{ + struct device *dev = other; + + if (dev && p) + devm_kfree(dev, p); +} + +/* + * altr_l2_check_deps() + * Test for L2 cache ECC dependencies upon entry because + * platform specific startup should have initialized the L2 + * memory and enabled the ECC. + * Bail if ECC is not enabled. + * Note that L2 Cache Enable is forced at build time. + */ +static int altr_l2_check_deps(struct platform_device *pdev, + void __iomem *base) +{ + if (readl(base) & ALTR_L2_ECC_EN) + return 0; + + edac_printk(KERN_ERR, EDAC_DEVICE, + "L2: No ECC present, or ECC disabled\n"); + return -ENODEV; +} + +const struct edac_device_prv_data l2ecc_data = { + .setup = altr_l2_check_deps, + .ce_clear_mask = 0, + .ue_clear_mask = 0, + .dbgfs_name = "altr_l2_trigger", + .alloc_mem = l2_alloc_mem, + .free_mem = l2_free_mem, + .ecc_enable_mask = ALTR_L2_ECC_EN, + .ce_set_mask = (ALTR_L2_ECC_EN | ALTR_L2_ECC_INJS), + .ue_set_mask = (ALTR_L2_ECC_EN | ALTR_L2_ECC_INJD), + .trig_alloc_sz = ALTR_TRIG_L2C_BYTE_SIZE, +}; + +#endif /* CONFIG_EDAC_ALTERA_L2C */ + MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Thor Thayer"); -MODULE_DESCRIPTION("EDAC Driver for Altera SDRAM Controller"); +MODULE_DESCRIPTION("EDAC Driver for Altera Memories"); -- cgit v1.2.3-58-ga151 From 83bdaad4d919722744ef1b726a9913ec36c6a430 Mon Sep 17 00:00:00 2001 From: Hubert Chrzaniuk Date: Mon, 7 Mar 2016 15:30:45 +0100 Subject: EDAC, sb_edac: Fix logic when computing DIMM sizes on Xeon Phi Correct a typo introduced by d0cdf9003140 ("EDAC, sb_edac: Add Knights Landing (Xeon Phi gen 2) support") As a result under some configurations DIMMs were not correctly recognized. Problem affects only Xeon Phi architecture. Signed-off-by: Hubert Chrzaniuk Acked-by: Aristeu Rozanski Cc: Mauro Carvalho Chehab Cc: linux-edac Cc: lukasz.anaczkowski@intel.com Link: http://lkml.kernel.org/r/1457361045-26221-1-git-send-email-hubert.chrzaniuk@intel.com Signed-off-by: Borislav Petkov --- drivers/edac/sb_edac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/edac') diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index e438ee5b433f..f5c6b97c8958 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -1574,7 +1574,7 @@ static int knl_get_dimm_capacity(struct sbridge_pvt *pvt, u64 *mc_sizes) for (cha = 0; cha < KNL_MAX_CHAS; cha++) { if (knl_get_mc_route(target, mc_route_reg[cha]) == channel - && participants[channel]) { + && !participants[channel]) { participant_count++; participants[channel] = 1; break; -- cgit v1.2.3-58-ga151 From be0aec23bf4624fd55650629fe8df20483487049 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 7 Mar 2016 14:02:18 +0100 Subject: x86/mce/AMD, EDAC: Enable error decoding of Scalable MCA errors For Scalable MCA enabled processors, errors are listed per IP block. And since it is not required for an IP to map to a particular bank, we need to use HWID and McaType values from the MCx_IPID register to figure out which IP a given bank represents. We also have a new bit (TCC) in the MCx_STATUS register to indicate Task context is corrupt. Add logic here to decode errors from all known IP blocks for Fam17h Model 00-0fh and to print TCC errors. [ Minor fixups. ] Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1457021458-2522-3-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 59 ++++++ arch/x86/kernel/cpu/mcheck/mce_amd.c | 29 +++ drivers/edac/mce_amd.c | 335 ++++++++++++++++++++++++++++++++++- 3 files changed, 420 insertions(+), 3 deletions(-) (limited to 'drivers/edac') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 80ba0a8d6d06..9c467fe00551 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -42,6 +42,18 @@ /* AMD-specific bits */ #define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */ #define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ +#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ + +/* + * McaX field if set indicates a given bank supports MCA extensions: + * - Deferred error interrupt type is specifiable by bank. + * - MCx_MISC0[BlkPtr] field indicates presence of extended MISC registers, + * But should not be used to determine MSR numbers. + * - TCC bit is present in MCx_STATUS. + */ +#define MCI_CONFIG_MCAX 0x1 +#define MCI_IPID_MCATYPE 0xFFFF0000 +#define MCI_IPID_HWID 0xFFF /* * Note that the full MCACOD field of IA32_MCi_STATUS MSR is @@ -93,7 +105,9 @@ /* AMD Scalable MCA */ #define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 +#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005 #define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x)) /* * This structure contains all data related to the MCE log. Also @@ -291,4 +305,49 @@ struct cper_sec_mem_err; extern void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err); +/* + * Enumerate new IP types and HWID values in AMD processors which support + * Scalable MCA. + */ +#ifdef CONFIG_X86_MCE_AMD +enum amd_ip_types { + SMCA_F17H_CORE = 0, /* Core errors */ + SMCA_DF, /* Data Fabric */ + SMCA_UMC, /* Unified Memory Controller */ + SMCA_PB, /* Parameter Block */ + SMCA_PSP, /* Platform Security Processor */ + SMCA_SMU, /* System Management Unit */ + N_AMD_IP_TYPES +}; + +struct amd_hwid { + const char *name; + unsigned int hwid; +}; + +extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES]; + +enum amd_core_mca_blocks { + SMCA_LS = 0, /* Load Store */ + SMCA_IF, /* Instruction Fetch */ + SMCA_L2_CACHE, /* L2 cache */ + SMCA_DE, /* Decoder unit */ + RES, /* Reserved */ + SMCA_EX, /* Execution unit */ + SMCA_FP, /* Floating Point */ + SMCA_L3_CACHE, /* L3 cache */ + N_CORE_MCA_BLOCKS +}; + +extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS]; + +enum amd_df_mca_blocks { + SMCA_CS = 0, /* Coherent Slave */ + SMCA_PIE, /* Power management, Interrupts, etc */ + N_DF_BLOCKS +}; + +extern const char * const amd_df_mcablock_names[N_DF_BLOCKS]; +#endif + #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 88de27bd5797..ee487a93ebe7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -71,6 +71,35 @@ static const char * const th_names[] = { "execution_unit", }; +/* Define HWID to IP type mappings for Scalable MCA */ +struct amd_hwid amd_hwids[] = { + [SMCA_F17H_CORE] = { "f17h_core", 0xB0 }, + [SMCA_DF] = { "data_fabric", 0x2E }, + [SMCA_UMC] = { "umc", 0x96 }, + [SMCA_PB] = { "param_block", 0x5 }, + [SMCA_PSP] = { "psp", 0xFF }, + [SMCA_SMU] = { "smu", 0x1 }, +}; +EXPORT_SYMBOL_GPL(amd_hwids); + +const char * const amd_core_mcablock_names[] = { + [SMCA_LS] = "load_store", + [SMCA_IF] = "insn_fetch", + [SMCA_L2_CACHE] = "l2_cache", + [SMCA_DE] = "decode_unit", + [RES] = "", + [SMCA_EX] = "execution_unit", + [SMCA_FP] = "floating_point", + [SMCA_L3_CACHE] = "l3_cache", +}; +EXPORT_SYMBOL_GPL(amd_core_mcablock_names); + +const char * const amd_df_mcablock_names[] = { + [SMCA_CS] = "coherent_slave", + [SMCA_PIE] = "pie", +}; +EXPORT_SYMBOL_GPL(amd_df_mcablock_names); + static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index e3a945ce374b..49768c08ac07 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -147,6 +147,135 @@ static const char * const mc6_mce_desc[] = { "Status Register File", }; +/* Scalable MCA error strings */ +static const char * const f17h_ls_mce_desc[] = { + "Load queue parity", + "Store queue parity", + "Miss address buffer payload parity", + "L1 TLB parity", + "", /* reserved */ + "DC tag error type 6", + "DC tag error type 1", + "Internal error type 1", + "Internal error type 2", + "Sys Read data error thread 0", + "Sys read data error thread 1", + "DC tag error type 2", + "DC data error type 1 (poison comsumption)", + "DC data error type 2", + "DC data error type 3", + "DC tag error type 4", + "L2 TLB parity", + "PDC parity error", + "DC tag error type 3", + "DC tag error type 5", + "L2 fill data error", +}; + +static const char * const f17h_if_mce_desc[] = { + "microtag probe port parity error", + "IC microtag or full tag multi-hit error", + "IC full tag parity", + "IC data array parity", + "Decoupling queue phys addr parity error", + "L0 ITLB parity error", + "L1 ITLB parity error", + "L2 ITLB parity error", + "BPQ snoop parity on Thread 0", + "BPQ snoop parity on Thread 1", + "L1 BTB multi-match error", + "L2 BTB multi-match error", +}; + +static const char * const f17h_l2_mce_desc[] = { + "L2M tag multi-way-hit error", + "L2M tag ECC error", + "L2M data ECC error", + "HW assert", +}; + +static const char * const f17h_de_mce_desc[] = { + "uop cache tag parity error", + "uop cache data parity error", + "Insn buffer parity error", + "Insn dispatch queue parity error", + "Fetch address FIFO parity", + "Patch RAM data parity", + "Patch RAM sequencer parity", + "uop buffer parity" +}; + +static const char * const f17h_ex_mce_desc[] = { + "Watchdog timeout error", + "Phy register file parity", + "Flag register file parity", + "Immediate displacement register file parity", + "Address generator payload parity", + "EX payload parity", + "Checkpoint queue parity", + "Retire dispatch queue parity", +}; + +static const char * const f17h_fp_mce_desc[] = { + "Physical register file parity", + "Freelist parity error", + "Schedule queue parity", + "NSQ parity error", + "Retire queue parity", + "Status register file parity", +}; + +static const char * const f17h_l3_mce_desc[] = { + "Shadow tag macro ECC error", + "Shadow tag macro multi-way-hit error", + "L3M tag ECC error", + "L3M tag multi-way-hit error", + "L3M data ECC error", + "XI parity, L3 fill done channel error", + "L3 victim queue parity", + "L3 HW assert", +}; + +static const char * const f17h_cs_mce_desc[] = { + "Illegal request from transport layer", + "Address violation", + "Security violation", + "Illegal response from transport layer", + "Unexpected response", + "Parity error on incoming request or probe response data", + "Parity error on incoming read response data", + "Atomic request parity", + "ECC error on probe filter access", +}; + +static const char * const f17h_pie_mce_desc[] = { + "HW assert", + "Internal PIE register security violation", + "Error on GMI link", + "Poison data written to internal PIE register", +}; + +static const char * const f17h_umc_mce_desc[] = { + "DRAM ECC error", + "Data poison error on DRAM", + "SDP parity error", + "Advanced peripheral bus error", + "Command/address parity error", + "Write data CRC error", +}; + +static const char * const f17h_pb_mce_desc[] = { + "Parameter Block RAM ECC error", +}; + +static const char * const f17h_psp_mce_desc[] = { + "PSP RAM ECC or parity error", +}; + +static const char * const f17h_smu_mce_desc[] = { + "SMU RAM ECC or parity error", +}; + static bool f12h_mc0_mce(u16 ec, u8 xec) { bool ret = false; @@ -691,6 +820,177 @@ static void decode_mc6_mce(struct mce *m) pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n"); } +static void decode_f17h_core_errors(const char *ip_name, u8 xec, + unsigned int mca_type) +{ + const char * const *error_desc_array; + size_t len; + + pr_emerg(HW_ERR "%s Error: ", ip_name); + + switch (mca_type) { + case SMCA_LS: + error_desc_array = f17h_ls_mce_desc; + len = ARRAY_SIZE(f17h_ls_mce_desc) - 1; + + if (xec == 0x4) { + pr_cont("Unrecognized LS MCA error code.\n"); + return; + } + break; + + case SMCA_IF: + error_desc_array = f17h_if_mce_desc; + len = ARRAY_SIZE(f17h_if_mce_desc) - 1; + break; + + case SMCA_L2_CACHE: + error_desc_array = f17h_l2_mce_desc; + len = ARRAY_SIZE(f17h_l2_mce_desc) - 1; + break; + + case SMCA_DE: + error_desc_array = f17h_de_mce_desc; + len = ARRAY_SIZE(f17h_de_mce_desc) - 1; + break; + + case SMCA_EX: + error_desc_array = f17h_ex_mce_desc; + len = ARRAY_SIZE(f17h_ex_mce_desc) - 1; + break; + + case SMCA_FP: + error_desc_array = f17h_fp_mce_desc; + len = ARRAY_SIZE(f17h_fp_mce_desc) - 1; + break; + + case SMCA_L3_CACHE: + error_desc_array = f17h_l3_mce_desc; + len = ARRAY_SIZE(f17h_l3_mce_desc) - 1; + break; + + default: + pr_cont("Corrupted MCA core error info.\n"); + return; + } + + if (xec > len) { + pr_cont("Unrecognized %s MCA bank error code.\n", + amd_core_mcablock_names[mca_type]); + return; + } + + pr_cont("%s.\n", error_desc_array[xec]); +} + +static void decode_df_errors(u8 xec, unsigned int mca_type) +{ + const char * const *error_desc_array; + size_t len; + + pr_emerg(HW_ERR "Data Fabric Error: "); + + switch (mca_type) { + case SMCA_CS: + error_desc_array = f17h_cs_mce_desc; + len = ARRAY_SIZE(f17h_cs_mce_desc) - 1; + break; + + case SMCA_PIE: + error_desc_array = f17h_pie_mce_desc; + len = ARRAY_SIZE(f17h_pie_mce_desc) - 1; + break; + + default: + pr_cont("Corrupted MCA Data Fabric info.\n"); + return; + } + + if (xec > len) { + pr_cont("Unrecognized %s MCA bank error code.\n", + amd_df_mcablock_names[mca_type]); + return; + } + + pr_cont("%s.\n", error_desc_array[xec]); +} + +/* Decode errors according to Scalable MCA specification */ +static void decode_smca_errors(struct mce *m) +{ + u32 addr = MSR_AMD64_SMCA_MCx_IPID(m->bank); + unsigned int hwid, mca_type, i; + u8 xec = XEC(m->status, xec_mask); + const char * const *error_desc_array; + const char *ip_name; + u32 low, high; + size_t len; + + if (rdmsr_safe(addr, &low, &high)) { + pr_emerg("Invalid IP block specified, error information is unreliable.\n"); + return; + } + + hwid = high & MCI_IPID_HWID; + mca_type = (high & MCI_IPID_MCATYPE) >> 16; + + pr_emerg(HW_ERR "MC%d IPID value: 0x%08x%08x\n", m->bank, high, low); + + /* + * Based on hwid and mca_type values, decode errors from respective IPs. + * Note: mca_type values make sense only in the context of an hwid. + */ + for (i = 0; i < ARRAY_SIZE(amd_hwids); i++) + if (amd_hwids[i].hwid == hwid) + break; + + switch (i) { + case SMCA_F17H_CORE: + ip_name = (mca_type == SMCA_L3_CACHE) ? + "L3 Cache" : "F17h Core"; + return decode_f17h_core_errors(ip_name, xec, mca_type); + break; + + case SMCA_DF: + return decode_df_errors(xec, mca_type); + break; + + case SMCA_UMC: + error_desc_array = f17h_umc_mce_desc; + len = ARRAY_SIZE(f17h_umc_mce_desc) - 1; + break; + + case SMCA_PB: + error_desc_array = f17h_pb_mce_desc; + len = ARRAY_SIZE(f17h_pb_mce_desc) - 1; + break; + + case SMCA_PSP: + error_desc_array = f17h_psp_mce_desc; + len = ARRAY_SIZE(f17h_psp_mce_desc) - 1; + break; + + case SMCA_SMU: + error_desc_array = f17h_smu_mce_desc; + len = ARRAY_SIZE(f17h_smu_mce_desc) - 1; + break; + + default: + pr_emerg(HW_ERR "HWID:%d does not match any existing IPs.\n", hwid); + return; + } + + ip_name = amd_hwids[i].name; + pr_emerg(HW_ERR "%s Error: ", ip_name); + + if (xec > len) { + pr_cont("Unrecognized %s MCA bank error code.\n", ip_name); + return; + } + + pr_cont("%s.\n", error_desc_array[xec]); +} + static inline void amd_decode_err_code(u16 ec) { if (INT_ERROR(ec)) { @@ -752,6 +1052,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) struct mce *m = (struct mce *)data; struct cpuinfo_x86 *c = &cpu_data(m->extcpu); int ecc; + u32 ebx = cpuid_ebx(0x80000007); if (amd_filter_mce(m)) return NOTIFY_STOP; @@ -769,11 +1070,20 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"), ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-")); - if (c->x86 == 0x15 || c->x86 == 0x16) + if (c->x86 >= 0x15) pr_cont("|%s|%s", ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"), ((m->status & MCI_STATUS_POISON) ? "Poison" : "-")); + if (!!(ebx & BIT(3))) { + u32 low, high; + u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); + + if (!rdmsr_safe(addr, &low, &high) && + (low & MCI_CONFIG_MCAX)) + pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-")); + } + /* do the two bits[14:13] together */ ecc = (m->status >> 45) & 0x3; if (ecc) @@ -784,6 +1094,11 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (m->status & MCI_STATUS_ADDRV) pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr); + if (!!(ebx & BIT(3))) { + decode_smca_errors(m); + goto err_code; + } + if (!fam_ops) goto err_code; @@ -834,6 +1149,7 @@ static struct notifier_block amd_mce_dec_nb = { static int __init mce_amd_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; + u32 ebx; if (c->x86_vendor != X86_VENDOR_AMD) return -ENODEV; @@ -888,10 +1204,18 @@ static int __init mce_amd_init(void) fam_ops->mc2_mce = f16h_mc2_mce; break; + case 0x17: + ebx = cpuid_ebx(0x80000007); + xec_mask = 0x3f; + if (!(ebx & BIT(3))) { + printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n"); + goto err_out; + } + break; + default: printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86); - kfree(fam_ops); - fam_ops = NULL; + goto err_out; } pr_info("MCE: In-kernel MCE decoding enabled.\n"); @@ -899,6 +1223,11 @@ static int __init mce_amd_init(void) mce_register_decode_chain(&amd_mce_dec_nb); return 0; + +err_out: + kfree(fam_ops); + fam_ops = NULL; + return -EINVAL; } early_initcall(mce_amd_init); -- cgit v1.2.3-58-ga151 From eb1af3b71f9d83e45f2fd2fd649356e98e1c582c Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Wed, 9 Mar 2016 16:40:48 -0800 Subject: EDAC/sb_edac: Fix computation of channel address Large memory Haswell-EX systems with multiple DIMMs per channel were sometimes reporting the wrong DIMM. Found three problems: 1) Debug printouts for socket and channel interleave were not interpreting the register fields correctly. The socket interleave field is a 2^X value (0=1, 1=2, 2=4, 3=8). The channel interleave is X+1 (0=1, 1=2, 2=3. 3=4). 2) Actual use of the socket interleave value didn't interpret as 2^X 3) Conversion of address to channel address was complicated, and wrong. Signed-off-by: Tony Luck Acked-by: Aristeu Rozanski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-edac@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Ingo Molnar --- drivers/edac/sb_edac.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'drivers/edac') diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index e438ee5b433f..34e54a313bf4 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -1839,8 +1839,8 @@ static void get_memory_layout(const struct mem_ctl_info *mci) edac_dbg(0, "TAD#%d: up to %u.%03u GB (0x%016Lx), socket interleave %d, memory interleave %d, TGT: %d, %d, %d, %d, reg=0x%08x\n", n_tads, gb, (mb*1000)/1024, ((u64)tmp_mb) << 20L, - (u32)TAD_SOCK(reg), - (u32)TAD_CH(reg), + (u32)(1 << TAD_SOCK(reg)), + (u32)TAD_CH(reg) + 1, (u32)TAD_TGT0(reg), (u32)TAD_TGT1(reg), (u32)TAD_TGT2(reg), @@ -2118,7 +2118,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci, } ch_way = TAD_CH(reg) + 1; - sck_way = TAD_SOCK(reg) + 1; + sck_way = 1 << TAD_SOCK(reg); if (ch_way == 3) idx = addr >> 6; @@ -2175,7 +2175,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci, n_tads, addr, limit, - (u32)TAD_SOCK(reg), + sck_way, ch_way, offset, idx, @@ -2190,18 +2190,12 @@ static int get_memory_error_data(struct mem_ctl_info *mci, offset, addr); return -EINVAL; } - addr -= offset; - /* Store the low bits [0:6] of the addr */ - ch_addr = addr & 0x7f; - /* Remove socket wayness and remove 6 bits */ - addr >>= 6; - addr = div_u64(addr, sck_xch); -#if 0 - /* Divide by channel way */ - addr = addr / ch_way; -#endif - /* Recover the last 6 bits */ - ch_addr |= addr << 6; + + ch_addr = addr - offset; + ch_addr >>= (6 + shiftup); + ch_addr /= ch_way * sck_way; + ch_addr <<= (6 + shiftup); + ch_addr |= addr & ((1 << (6 + shiftup)) - 1); /* * Step 3) Decode rank -- cgit v1.2.3-58-ga151