diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 532 |
1 files changed, 368 insertions, 164 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 016ea274b955..0e2ee5869b5f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -25,10 +25,13 @@ #include <linux/list.h> #include <linux/module.h> #include <linux/uaccess.h> +#include <linux/reboot.h> +#include <linux/syscalls.h> #include "amdgpu.h" #include "amdgpu_ras.h" #include "amdgpu_atomfirmware.h" +#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" const char *ras_error_string[] = { "none", @@ -65,11 +68,8 @@ const char *ras_block_string[] = { /* inject address is 52 bits */ #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) -static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev, - uint64_t offset, uint64_t size, - struct amdgpu_bo **bo_ptr); -static int amdgpu_ras_release_vram(struct amdgpu_device *adev, - struct amdgpu_bo **bo_ptr); + +atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, size_t size, loff_t *pos) @@ -150,6 +150,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, op = 1; else if (sscanf(str, "inject %32s %8s", block_name, err) == 2) op = 2; + else if (sscanf(str, "reboot %32s", block_name) == 1) + op = 3; else if (str[0] && str[1] && str[2] && str[3]) /* ascii string, but commands are not matched. */ return -EINVAL; @@ -189,6 +191,10 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, return 0; } + +static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, + struct ras_common_if *head); + /** * DOC: AMDGPU RAS debugfs control interface * @@ -210,29 +216,36 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, * * Second member: struct ras_debug_if::op. * It has three kinds of operations. - * 0: disable RAS on the block. Take ::head as its data. - * 1: enable RAS on the block. Take ::head as its data. - * 2: inject errors on the block. Take ::inject as its data. + * + * - 0: disable RAS on the block. Take ::head as its data. + * - 1: enable RAS on the block. Take ::head as its data. + * - 2: inject errors on the block. Take ::inject as its data. * * How to use the interface? * programs: * copy the struct ras_debug_if in your codes and initialize it. * write the struct to the control node. * - * bash: - * echo op block [error [sub_blcok address value]] > .../ras/ras_ctrl - * op: disable, enable, inject - * disable: only block is needed - * enable: block and error are needed - * inject: error, address, value are needed - * block: umc, smda, gfx, ......... - * see ras_block_string[] for details - * error: ue, ce - * ue: multi_uncorrectable - * ce: single_correctable - * sub_block: sub block index, pass 0 if there is no sub block + * .. code-block:: bash + * + * echo op block [error [sub_blcok address value]] > .../ras/ras_ctrl + * + * op: disable, enable, inject + * disable: only block is needed + * enable: block and error are needed + * inject: error, address, value are needed + * block: umc, smda, gfx, ......... + * see ras_block_string[] for details + * error: ue, ce + * ue: multi_uncorrectable + * ce: single_correctable + * sub_block: + * sub block index, pass 0 if there is no sub block + * + * here are some examples for bash commands: + * + * .. code-block:: bash * - * here are some examples for bash commands, * echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl * echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl * echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl @@ -245,8 +258,9 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, * For inject, please check corresponding err count at * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count * - * NOTE: operation is only allowed on blocks which are supported. - * Please check ras mask at /sys/module/amdgpu/parameters/ras_mask + * .. note:: + * Operation is only allowed on blocks which are supported. + * Please check ras mask at /sys/module/amdgpu/parameters/ras_mask */ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf, size_t size, loff_t *pos) @@ -279,6 +293,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * /* data.inject.address is offset instead of absolute gpu address */ ret = amdgpu_ras_error_inject(adev, &data.inject); break; + case 3: + amdgpu_ras_get_context(adev)->reboot = true; + break; default: ret = -EINVAL; break; @@ -290,6 +307,33 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * return size; } +/** + * DOC: AMDGPU RAS debugfs EEPROM table reset interface + * + * Some boards contain an EEPROM which is used to persistently store a list of + * bad pages containing ECC errors detected in vram. This interface provides + * a way to reset the EEPROM, e.g., after testing error injection. + * + * Usage: + * + * .. code-block:: bash + * + * echo 1 > ../ras/ras_eeprom_reset + * + * will reset EEPROM table to 0 entries. + * + */ +static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user *buf, + size_t size, loff_t *pos) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; + int ret; + + ret = amdgpu_ras_eeprom_reset_table(&adev->psp.ras.ras->eeprom_control); + + return ret == 1 ? size : -EIO; +} + static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = { .owner = THIS_MODULE, .read = NULL, @@ -297,6 +341,34 @@ static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = { .llseek = default_llseek }; +static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = { + .owner = THIS_MODULE, + .read = NULL, + .write = amdgpu_ras_debugfs_eeprom_write, + .llseek = default_llseek +}; + +/** + * DOC: AMDGPU RAS sysfs Error Count Interface + * + * It allows user to read the error count for each IP block on the gpu through + * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count + * + * It outputs the multiple lines which report the uncorrected (ue) and corrected + * (ce) error counts. + * + * The format of one line is below, + * + * [ce|ue]: count + * + * Example: + * + * .. code-block:: bash + * + * ue: 0 + * ce: 1 + * + */ static ssize_t amdgpu_ras_sysfs_read(struct device *dev, struct device_attribute *attr, char *buf) { @@ -615,8 +687,12 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev, adev->gfx.funcs->query_ras_error_count(adev, &err_data); break; case AMDGPU_RAS_BLOCK__MMHUB: - if (adev->mmhub_funcs->query_ras_error_count) - adev->mmhub_funcs->query_ras_error_count(adev, &err_data); + if (adev->mmhub.funcs->query_ras_error_count) + adev->mmhub.funcs->query_ras_error_count(adev, &err_data); + break; + case AMDGPU_RAS_BLOCK__PCIE_BIF: + if (adev->nbio.funcs->query_ras_error_count) + adev->nbio.funcs->query_ras_error_count(adev, &err_data); break; default: break; @@ -628,12 +704,14 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev, info->ue_count = obj->err_data.ue_count; info->ce_count = obj->err_data.ce_count; - if (err_data.ce_count) + if (err_data.ce_count) { dev_info(adev->dev, "%ld correctable errors detected in %s block\n", obj->err_data.ce_count, ras_block_str(info->head.block)); - if (err_data.ue_count) + } + if (err_data.ue_count) { dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n", obj->err_data.ue_count, ras_block_str(info->head.block)); + } return 0; } @@ -664,6 +742,8 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, break; case AMDGPU_RAS_BLOCK__UMC: case AMDGPU_RAS_BLOCK__MMHUB: + case AMDGPU_RAS_BLOCK__XGMI_WAFL: + case AMDGPU_RAS_BLOCK__PCIE_BIF: ret = psp_ras_trigger_error(&adev->psp, &block_info); break; default: @@ -733,8 +813,8 @@ static char *amdgpu_ras_badpage_flags_str(unsigned int flags) }; } -/* - * DOC: ras sysfs gpu_vram_bad_pages interface +/** + * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface * * It allows user to read the bad pages of vram on the gpu through * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages @@ -746,14 +826,21 @@ static char *amdgpu_ras_badpage_flags_str(unsigned int flags) * * gpu pfn and gpu page size are printed in hex format. * flags can be one of below character, + * * R: reserved, this gpu page is reserved and not able to use. + * * P: pending for reserve, this gpu page is marked as bad, will be reserved - * in next window of page_reserve. + * in next window of page_reserve. + * * F: unable to reserve. this gpu page can't be reserved due to some reasons. * - * examples: - * 0x00000001 : 0x00001000 : R - * 0x00000002 : 0x00001000 : P + * Examples: + * + * .. code-block:: bash + * + * 0x00000001 : 0x00001000 : R + * 0x00000002 : 0x00001000 : P + * */ static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f, @@ -934,8 +1021,10 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) struct drm_minor *minor = adev->ddev->primary; con->dir = debugfs_create_dir("ras", minor->debugfs_root); - con->ent = debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir, - adev, &amdgpu_ras_debugfs_ctrl_ops); + debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir, + adev, &amdgpu_ras_debugfs_ctrl_ops); + debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir, + adev, &amdgpu_ras_debugfs_eeprom_ops); } void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, @@ -980,10 +1069,8 @@ static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev) amdgpu_ras_debugfs_remove(adev, &obj->head); } - debugfs_remove(con->ent); - debugfs_remove(con->dir); + debugfs_remove_recursive(con->dir); con->dir = NULL; - con->ent = NULL; } /* debugfs end */ @@ -1188,14 +1275,14 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, for (; i < data->count; i++) { (*bps)[i] = (struct ras_badpage){ - .bp = data->bps[i].bp, + .bp = data->bps[i].retired_page, .size = AMDGPU_GPU_PAGE_SIZE, .flags = 0, }; if (data->last_reserved <= i) (*bps)[i].flags = 1; - else if (data->bps[i].bo == NULL) + else if (data->bps_bo[i] == NULL) (*bps)[i].flags = 2; } @@ -1214,105 +1301,46 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) atomic_set(&ras->in_recovery, 0); } -static int amdgpu_ras_release_vram(struct amdgpu_device *adev, - struct amdgpu_bo **bo_ptr) -{ - /* no need to free it actually. */ - amdgpu_bo_free_kernel(bo_ptr, NULL, NULL); - return 0; -} - -/* reserve vram with size@offset */ -static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev, - uint64_t offset, uint64_t size, - struct amdgpu_bo **bo_ptr) -{ - struct ttm_operation_ctx ctx = { false, false }; - struct amdgpu_bo_param bp; - int r = 0; - int i; - struct amdgpu_bo *bo; - - if (bo_ptr) - *bo_ptr = NULL; - memset(&bp, 0, sizeof(bp)); - bp.size = size; - bp.byte_align = PAGE_SIZE; - bp.domain = AMDGPU_GEM_DOMAIN_VRAM; - bp.flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS | - AMDGPU_GEM_CREATE_NO_CPU_ACCESS; - bp.type = ttm_bo_type_kernel; - bp.resv = NULL; - - r = amdgpu_bo_create(adev, &bp, &bo); - if (r) - return -EINVAL; - - r = amdgpu_bo_reserve(bo, false); - if (r) - goto error_reserve; - - offset = ALIGN(offset, PAGE_SIZE); - for (i = 0; i < bo->placement.num_placement; ++i) { - bo->placements[i].fpfn = offset >> PAGE_SHIFT; - bo->placements[i].lpfn = (offset + size) >> PAGE_SHIFT; - } - - ttm_bo_mem_put(&bo->tbo, &bo->tbo.mem); - r = ttm_bo_mem_space(&bo->tbo, &bo->placement, &bo->tbo.mem, &ctx); - if (r) - goto error_pin; - - r = amdgpu_bo_pin_restricted(bo, - AMDGPU_GEM_DOMAIN_VRAM, - offset, - offset + size); - if (r) - goto error_pin; - - if (bo_ptr) - *bo_ptr = bo; - - amdgpu_bo_unreserve(bo); - return r; - -error_pin: - amdgpu_bo_unreserve(bo); -error_reserve: - amdgpu_bo_unref(&bo); - return r; -} - /* alloc/realloc bps array */ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, struct ras_err_handler_data *data, int pages) { unsigned int old_space = data->count + data->space_left; unsigned int new_space = old_space + pages; - unsigned int align_space = ALIGN(new_space, 1024); - void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); - - if (!tmp) + unsigned int align_space = ALIGN(new_space, 512); + void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); + struct amdgpu_bo **bps_bo = + kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL); + + if (!bps || !bps_bo) { + kfree(bps); + kfree(bps_bo); return -ENOMEM; + } if (data->bps) { - memcpy(tmp, data->bps, + memcpy(bps, data->bps, data->count * sizeof(*data->bps)); kfree(data->bps); } + if (data->bps_bo) { + memcpy(bps_bo, data->bps_bo, + data->count * sizeof(*data->bps_bo)); + kfree(data->bps_bo); + } - data->bps = tmp; + data->bps = bps; + data->bps_bo = bps_bo; data->space_left += align_space - old_space; return 0; } /* it deal with vram only. */ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, - unsigned long *bps, int pages) + struct eeprom_table_record *bps, int pages) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data; - int i = pages; int ret = 0; if (!con || !con->eh_data || !bps || pages <= 0) @@ -1329,24 +1357,87 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, goto out; } - while (i--) - data->bps[data->count++].bp = bps[i]; - + memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps)); + data->count += pages; data->space_left -= pages; + out: mutex_unlock(&con->recovery_lock); return ret; } +/* + * write error record array to eeprom, the function should be + * protected by recovery_lock + */ +static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_err_handler_data *data; + struct amdgpu_ras_eeprom_control *control; + int save_count; + + if (!con || !con->eh_data) + return 0; + + control = &con->eeprom_control; + data = con->eh_data; + save_count = data->count - control->num_recs; + /* only new entries are saved */ + if (save_count > 0) + if (amdgpu_ras_eeprom_process_recods(control, + &data->bps[control->num_recs], + true, + save_count)) { + DRM_ERROR("Failed to save EEPROM table data!"); + return -EIO; + } + + return 0; +} + +/* + * read error record array in eeprom and reserve enough space for + * storing new bad pages + */ +static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) +{ + struct amdgpu_ras_eeprom_control *control = + &adev->psp.ras.ras->eeprom_control; + struct eeprom_table_record *bps = NULL; + int ret = 0; + + /* no bad page record, skip eeprom access */ + if (!control->num_recs) + return ret; + + bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL); + if (!bps) + return -ENOMEM; + + if (amdgpu_ras_eeprom_process_recods(control, bps, false, + control->num_recs)) { + DRM_ERROR("Failed to load EEPROM table records!"); + ret = -EIO; + goto out; + } + + ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs); + +out: + kfree(bps); + return ret; +} + /* called in gpu recovery/init */ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data; uint64_t bp; - struct amdgpu_bo *bo; - int i; + struct amdgpu_bo *bo = NULL; + int i, ret = 0; if (!con || !con->eh_data) return 0; @@ -1357,18 +1448,29 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) goto out; /* reserve vram at driver post stage. */ for (i = data->last_reserved; i < data->count; i++) { - bp = data->bps[i].bp; + bp = data->bps[i].retired_page; - if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT, - PAGE_SIZE, &bo)) - DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp); + /* There are two cases of reserve error should be ignored: + * 1) a ras bad page has been allocated (used by someone); + * 2) a ras bad page has been reserved (duplicate error injection + * for one page); + */ + if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT, + AMDGPU_GPU_PAGE_SIZE, + AMDGPU_GEM_DOMAIN_VRAM, + &bo, NULL)) + DRM_WARN("RAS WARN: reserve vram for retired page %llx fail\n", bp); - data->bps[i].bo = bo; + data->bps_bo[i] = bo; data->last_reserved = i + 1; + bo = NULL; } + + /* continue to save bad pages to eeprom even reesrve_vram fails */ + ret = amdgpu_ras_save_bad_pages(adev); out: mutex_unlock(&con->recovery_lock); - return 0; + return ret; } /* called when driver unload */ @@ -1388,11 +1490,11 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev) goto out; for (i = data->last_reserved - 1; i >= 0; i--) { - bo = data->bps[i].bo; + bo = data->bps_bo[i]; - amdgpu_ras_release_vram(adev, &bo); + amdgpu_bo_free_kernel(&bo, NULL, NULL); - data->bps[i].bo = bo; + data->bps_bo[i] = bo; data->last_reserved = i; } out: @@ -1400,41 +1502,54 @@ out: return 0; } -static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) -{ - /* TODO - * write the array to eeprom when SMU disabled. - */ - return 0; -} - -static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) -{ - /* TODO - * read the array to eeprom when SMU disabled. - */ - return 0; -} - -static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) +int amdgpu_ras_recovery_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - struct ras_err_handler_data **data = &con->eh_data; + struct ras_err_handler_data **data; + int ret; - *data = kmalloc(sizeof(**data), - GFP_KERNEL|__GFP_ZERO); - if (!*data) - return -ENOMEM; + if (con) + data = &con->eh_data; + else + return 0; + + *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO); + if (!*data) { + ret = -ENOMEM; + goto out; + } mutex_init(&con->recovery_lock); INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); atomic_set(&con->in_recovery, 0); con->adev = adev; - amdgpu_ras_load_bad_pages(adev); - amdgpu_ras_reserve_bad_pages(adev); + ret = amdgpu_ras_eeprom_init(&con->eeprom_control); + if (ret) + goto free; + + if (con->eeprom_control.num_recs) { + ret = amdgpu_ras_load_bad_pages(adev); + if (ret) + goto free; + ret = amdgpu_ras_reserve_bad_pages(adev); + if (ret) + goto release; + } return 0; + +release: + amdgpu_ras_release_bad_pages(adev); +free: + kfree((*data)->bps); + kfree((*data)->bps_bo); + kfree(*data); + con->eh_data = NULL; +out: + DRM_WARN("Failed to initialize ras recovery!\n"); + + return ret; } static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) @@ -1442,13 +1557,17 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data = con->eh_data; + /* recovery_init failed to init it, fini is useless */ + if (!data) + return 0; + cancel_work_sync(&con->recovery_work); - amdgpu_ras_save_bad_pages(adev); amdgpu_ras_release_bad_pages(adev); mutex_lock(&con->recovery_lock); con->eh_data = NULL; kfree(data->bps); + kfree(data->bps_bo); kfree(data); mutex_unlock(&con->recovery_lock); @@ -1500,6 +1619,7 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev, int amdgpu_ras_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + int r; if (con) return 0; @@ -1527,31 +1647,106 @@ int amdgpu_ras_init(struct amdgpu_device *adev) /* Might need get this flag from vbios. */ con->flags = RAS_DEFAULT_FLAGS; - if (amdgpu_ras_recovery_init(adev)) - goto recovery_out; + if (adev->nbio.funcs->init_ras_controller_interrupt) { + r = adev->nbio.funcs->init_ras_controller_interrupt(adev); + if (r) + return r; + } + + if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) { + r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev); + if (r) + return r; + } amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK; if (amdgpu_ras_fs_init(adev)) goto fs_out; - /* ras init for each ras block */ - if (adev->umc.funcs->ras_init) - adev->umc.funcs->ras_init(adev); - DRM_INFO("RAS INFO: ras initialized successfully, " "hardware ability[%x] ras_mask[%x]\n", con->hw_supported, con->supported); return 0; fs_out: - amdgpu_ras_recovery_fini(adev); -recovery_out: amdgpu_ras_set_context(adev, NULL); kfree(con); return -EINVAL; } +/* helper function to handle common stuff in ip late init phase */ +int amdgpu_ras_late_init(struct amdgpu_device *adev, + struct ras_common_if *ras_block, + struct ras_fs_if *fs_info, + struct ras_ih_if *ih_info) +{ + int r; + + /* disable RAS feature per IP block if it is not supported */ + if (!amdgpu_ras_is_supported(adev, ras_block->block)) { + amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0); + return 0; + } + + r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1); + if (r) { + if (r == -EAGAIN) { + /* request gpu reset. will run again */ + amdgpu_ras_request_reset_on_boot(adev, + ras_block->block); + return 0; + } else if (adev->in_suspend || adev->in_gpu_reset) { + /* in resume phase, if fail to enable ras, + * clean up all ras fs nodes, and disable ras */ + goto cleanup; + } else + return r; + } + + /* in resume phase, no need to create ras fs node */ + if (adev->in_suspend || adev->in_gpu_reset) + return 0; + + if (ih_info->cb) { + r = amdgpu_ras_interrupt_add_handler(adev, ih_info); + if (r) + goto interrupt; + } + + amdgpu_ras_debugfs_create(adev, fs_info); + + r = amdgpu_ras_sysfs_create(adev, fs_info); + if (r) + goto sysfs; + + return 0; +cleanup: + amdgpu_ras_sysfs_remove(adev, ras_block); +sysfs: + amdgpu_ras_debugfs_remove(adev, ras_block); + if (ih_info->cb) + amdgpu_ras_interrupt_remove_handler(adev, ih_info); +interrupt: + amdgpu_ras_feature_enable(adev, ras_block, 0); + return r; +} + +/* helper function to remove ras fs node and interrupt handler */ +void amdgpu_ras_late_fini(struct amdgpu_device *adev, + struct ras_common_if *ras_block, + struct ras_ih_if *ih_info) +{ + if (!ras_block || !ih_info) + return; + + amdgpu_ras_sysfs_remove(adev, ras_block); + amdgpu_ras_debugfs_remove(adev, ras_block); + if (ih_info->cb) + amdgpu_ras_interrupt_remove_handler(adev, ih_info); + amdgpu_ras_feature_enable(adev, ras_block, 0); +} + /* do some init work after IP late init as dependence. * and it runs in resume/gpu reset/booting up cases. */ @@ -1645,3 +1840,12 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) return 0; } + +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) +{ + if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { + DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n"); + + amdgpu_ras_reset_gpu(adev, false); + } +} |