diff options
author | Muralidhara M K <muralidhara.mk@amd.com> | 2023-11-02 11:42:25 +0000 |
---|---|---|
committer | Borislav Petkov (AMD) <bp@alien8.de> | 2023-11-29 11:21:05 +0100 |
commit | 12f230c07a95d925d6af754485952515e7975127 (patch) | |
tree | d10657dc6780da1eaafc487c9b912123ed6d770b /drivers/edac | |
parent | 9a5f580c1c71b6aedba696c4898a7a7184cef8ad (diff) |
EDAC/amd64: Add support for family 0x19, models 0x90-9f devices
AMD Models 90h-9fh are APUs. They have built-in HBM3 memory. ECC support
is enabled by default.
APU models have a single Data Fabric (DF) per Package. Each DF is
visible to the OS in the same way as chiplet-based systems like Zen2
CPUs and later. However, the Unified Memory Controllers (UMCs) are
arranged in the same way as GPU-based MI200 devices rather than
CPU-based systems.
Use the existing gpu_ops for hetergeneous systems to support enumeration
of nodes and memory topology with few fixups.
[ bp: Massage comments. ]
Signed-off-by: Muralidhara M K <muralidhara.mk@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20231102114225.2006878-5-muralimk@amd.com
Diffstat (limited to 'drivers/edac')
-rw-r--r-- | drivers/edac/amd64_edac.c | 66 | ||||
-rw-r--r-- | drivers/edac/amd64_edac.h | 1 |
2 files changed, 49 insertions, 18 deletions
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 9b6642d00871..537b9987a431 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -996,15 +996,23 @@ static struct local_node_map { #define LNTM_NODE_COUNT GENMASK(27, 16) #define LNTM_BASE_NODE_ID GENMASK(11, 0) -static int gpu_get_node_map(void) +static int gpu_get_node_map(struct amd64_pvt *pvt) { struct pci_dev *pdev; int ret; u32 tmp; /* - * Node ID 0 is reserved for CPUs. - * Therefore, a non-zero Node ID means we've already cached the values. + * Mapping of nodes from hardware-provided AMD Node ID to a + * Linux logical one is applicable for MI200 models. Therefore, + * return early for other heterogeneous systems. + */ + if (pvt->F3->device != PCI_DEVICE_ID_AMD_MI200_DF_F3) + return 0; + + /* + * Node ID 0 is reserved for CPUs. Therefore, a non-zero Node ID + * means the values have been already cached. */ if (gpu_node_map.base_node_id) return 0; @@ -3851,7 +3859,7 @@ static void gpu_init_csrows(struct mem_ctl_info *mci) dimm->nr_pages = gpu_get_csrow_nr_pages(pvt, umc, cs); dimm->edac_mode = EDAC_SECDED; - dimm->mtype = MEM_HBM2; + dimm->mtype = pvt->dram_type; dimm->dtype = DEV_X16; dimm->grain = 64; } @@ -3880,7 +3888,7 @@ static bool gpu_ecc_enabled(struct amd64_pvt *pvt) return true; } -static inline u32 gpu_get_umc_base(u8 umc, u8 channel) +static inline u32 gpu_get_umc_base(struct amd64_pvt *pvt, u8 umc, u8 channel) { /* * On CPUs, there is one channel per UMC, so UMC numbering equals @@ -3893,13 +3901,16 @@ static inline u32 gpu_get_umc_base(u8 umc, u8 channel) * On GPU nodes channels are selected in 3rd nibble * HBM chX[3:0]= [Y ]5X[3:0]000; * HBM chX[7:4]= [Y+1]5X[3:0]000 + * + * On MI300 APU nodes, same as GPU nodes but channels are selected + * in the base address of 0x90000 */ umc *= 2; if (channel >= 4) umc++; - return 0x50000 + (umc << 20) + ((channel % 4) << 12); + return pvt->gpu_umc_base + (umc << 20) + ((channel % 4) << 12); } static void gpu_read_mc_regs(struct amd64_pvt *pvt) @@ -3910,7 +3921,7 @@ static void gpu_read_mc_regs(struct amd64_pvt *pvt) /* Read registers from each UMC */ for_each_umc(i) { - umc_base = gpu_get_umc_base(i, 0); + umc_base = gpu_get_umc_base(pvt, i, 0); umc = &pvt->umc[i]; amd_smn_read(nid, umc_base + UMCCH_UMC_CFG, &umc->umc_cfg); @@ -3927,7 +3938,7 @@ static void gpu_read_base_mask(struct amd64_pvt *pvt) for_each_umc(umc) { for_each_chip_select(cs, umc, pvt) { - base_reg = gpu_get_umc_base(umc, cs) + UMCCH_BASE_ADDR; + base_reg = gpu_get_umc_base(pvt, umc, cs) + UMCCH_BASE_ADDR; base = &pvt->csels[umc].csbases[cs]; if (!amd_smn_read(pvt->mc_node_id, base_reg, base)) { @@ -3935,7 +3946,7 @@ static void gpu_read_base_mask(struct amd64_pvt *pvt) umc, cs, *base, base_reg); } - mask_reg = gpu_get_umc_base(umc, cs) + UMCCH_ADDR_MASK; + mask_reg = gpu_get_umc_base(pvt, umc, cs) + UMCCH_ADDR_MASK; mask = &pvt->csels[umc].csmasks[cs]; if (!amd_smn_read(pvt->mc_node_id, mask_reg, mask)) { @@ -3960,7 +3971,7 @@ static int gpu_hw_info_get(struct amd64_pvt *pvt) { int ret; - ret = gpu_get_node_map(); + ret = gpu_get_node_map(pvt); if (ret) return ret; @@ -4125,6 +4136,8 @@ static int per_family_init(struct amd64_pvt *pvt) if (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) { pvt->ctl_name = "MI200"; pvt->max_mcs = 4; + pvt->dram_type = MEM_HBM2; + pvt->gpu_umc_base = 0x50000; pvt->ops = &gpu_ops; } else { pvt->ctl_name = "F19h_M30h"; @@ -4142,6 +4155,13 @@ static int per_family_init(struct amd64_pvt *pvt) pvt->ctl_name = "F19h_M70h"; pvt->flags.zn_regs_v2 = 1; break; + case 0x90 ... 0x9f: + pvt->ctl_name = "F19h_M90h"; + pvt->max_mcs = 4; + pvt->dram_type = MEM_HBM3; + pvt->gpu_umc_base = 0x90000; + pvt->ops = &gpu_ops; + break; case 0xa0 ... 0xaf: pvt->ctl_name = "F19h_MA0h"; pvt->max_mcs = 12; @@ -4180,23 +4200,33 @@ static const struct attribute_group *amd64_edac_attr_groups[] = { NULL }; +/* + * For heterogeneous and APU models EDAC CHIP_SELECT and CHANNEL layers + * should be swapped to fit into the layers. + */ +static unsigned int get_layer_size(struct amd64_pvt *pvt, u8 layer) +{ + bool is_gpu = (pvt->ops == &gpu_ops); + + if (!layer) + return is_gpu ? pvt->max_mcs + : pvt->csels[0].b_cnt; + else + return is_gpu ? pvt->csels[0].b_cnt + : pvt->max_mcs; +} + static int init_one_instance(struct amd64_pvt *pvt) { struct mem_ctl_info *mci = NULL; struct edac_mc_layer layers[2]; int ret = -ENOMEM; - /* - * For Heterogeneous family EDAC CHIP_SELECT and CHANNEL layers should - * be swapped to fit into the layers. - */ layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; - layers[0].size = (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) ? - pvt->max_mcs : pvt->csels[0].b_cnt; + layers[0].size = get_layer_size(pvt, 0); layers[0].is_virt_csrow = true; layers[1].type = EDAC_MC_LAYER_CHANNEL; - layers[1].size = (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) ? - pvt->csels[0].b_cnt : pvt->max_mcs; + layers[1].size = get_layer_size(pvt, 1); layers[1].is_virt_csrow = false; mci = edac_mc_alloc(pvt->mc_node_id, ARRAY_SIZE(layers), layers, 0); diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 5a4e4a59682b..1665f7932bac 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -362,6 +362,7 @@ struct amd64_pvt { u32 dct_sel_lo; /* DRAM Controller Select Low */ u32 dct_sel_hi; /* DRAM Controller Select High */ u32 online_spare; /* On-Line spare Reg */ + u32 gpu_umc_base; /* Base address used for channel selection on GPUs */ /* x4, x8, or x16 syndromes in use */ u8 ecc_sym_sz; |