diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-16 03:49:24 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-16 03:49:24 -0800 |
commit | ba54ff1fb662215de683777f815b9e96276d55cf (patch) | |
tree | d9ad29a17d91fafd76c0b16b41dd30445e50215c /drivers/misc | |
parent | dd6f9b17cd7af68b6a5090deedf1f5e84f66f4e6 (diff) | |
parent | f361c96c75184d0272572087c7d9874e0f64b870 (diff) |
Merge tag 'char-misc-6.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc
Pull char/misc driver updates from Greg KH:
"Here is the large set of char/misc and other driver subsystem changes
for 6.2-rc1. Nothing earth-shattering in here at all, just a lot of
new driver development and minor fixes.
Highlights include:
- fastrpc driver updates
- iio new drivers and updates
- habanalabs driver updates for new hardware and features
- slimbus driver updates
- speakup module parameters added to aid in boot time configuration
- i2c probe_new conversions for lots of different drivers
- other small driver fixes and additions
One semi-interesting change in here is the increase of the number of
misc dynamic minors available to 1048448 to handle new huge-cpu
systems.
All of these have been in linux-next for a while with no reported
problems"
* tag 'char-misc-6.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc: (521 commits)
extcon: usbc-tusb320: Convert to i2c's .probe_new()
extcon: rt8973: Convert to i2c's .probe_new()
extcon: fsa9480: Convert to i2c's .probe_new()
extcon: max77843: Replace irqchip mask_invert with unmask_base
chardev: fix error handling in cdev_device_add()
mcb: mcb-parse: fix error handing in chameleon_parse_gdd()
drivers: mcb: fix resource leak in mcb_probe()
coresight: etm4x: fix repeated words in comments
coresight: cti: Fix null pointer error on CTI init before ETM
coresight: trbe: remove cpuhp instance node before remove cpuhp state
counter: stm32-lptimer-cnt: fix the check on arr and cmp registers update
misc: fastrpc: Add dma_mask to fastrpc_channel_ctx
misc: fastrpc: Add mmap request assigning for static PD pool
misc: fastrpc: Safekeep mmaps on interrupted invoke
misc: fastrpc: Add support for audiopd
misc: fastrpc: Rework fastrpc_req_munmap
misc: fastrpc: Use fastrpc_map_put in fastrpc_map_create on fail
misc: fastrpc: Add fastrpc_remote_heap_alloc
misc: fastrpc: Add reserved mem support
misc: fastrpc: Rename audio protection domain to root
...
Diffstat (limited to 'drivers/misc')
52 files changed, 2269 insertions, 626 deletions
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 358ad56f6524..9947b7892bd5 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -176,6 +176,28 @@ config SGI_XP this feature will allow for direct communication between SSIs based on a network adapter and DMA messaging. +config SMPRO_ERRMON + tristate "Ampere Computing SMPro error monitor driver" + depends on MFD_SMPRO || COMPILE_TEST + help + Say Y here to get support for the SMpro error monitor function + provided by Ampere Computing's Altra and Altra Max SoCs. Upon + loading, the driver creates sysfs files which can be use to gather + multiple HW error data reported via read and write system calls. + + To compile this driver as a module, say M here. The driver will be + called smpro-errmon. + +config SMPRO_MISC + tristate "Ampere Computing SMPro miscellaneous driver" + depends on MFD_SMPRO || COMPILE_TEST + help + Say Y here to get support for the SMpro error miscellalenous function + provided by Ampere Computing's Altra and Altra Max SoCs. + + To compile this driver as a module, say M here. The driver will be + called smpro-misc. + config CS5535_MFGPT tristate "CS5535/CS5536 Geode Multi-Function General Purpose Timer (MFGPT) support" depends on MFD_CS5535 diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index ac9b3e757ba1..87b54a4a4422 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -23,6 +23,8 @@ obj-$(CONFIG_ENCLOSURE_SERVICES) += enclosure.o obj-$(CONFIG_KGDB_TESTS) += kgdbts.o obj-$(CONFIG_SGI_XP) += sgi-xp/ obj-$(CONFIG_SGI_GRU) += sgi-gru/ +obj-$(CONFIG_SMPRO_ERRMON) += smpro-errmon.o +obj-$(CONFIG_SMPRO_MISC) += smpro-misc.o obj-$(CONFIG_CS5535_MFGPT) += cs5535-mfgpt.o obj-$(CONFIG_GEHC_ACHC) += gehc-achc.o obj-$(CONFIG_HP_ILO) += hpilo.o diff --git a/drivers/misc/apds9802als.c b/drivers/misc/apds9802als.c index a32431f4b370..0526c55d5cd5 100644 --- a/drivers/misc/apds9802als.c +++ b/drivers/misc/apds9802als.c @@ -212,8 +212,7 @@ static int als_set_default_config(struct i2c_client *client) return ret_val; } -static int apds9802als_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int apds9802als_probe(struct i2c_client *client) { int res; struct als_data *data; @@ -297,7 +296,7 @@ static struct i2c_driver apds9802als_driver = { .name = DRIVER_NAME, .pm = APDS9802ALS_PM_OPS, }, - .probe = apds9802als_probe, + .probe_new = apds9802als_probe, .remove = apds9802als_remove, .id_table = apds9802als_id, }; diff --git a/drivers/misc/apds990x.c b/drivers/misc/apds990x.c index e2100cc42ce8..0024503ea6db 100644 --- a/drivers/misc/apds990x.c +++ b/drivers/misc/apds990x.c @@ -1051,8 +1051,7 @@ static const struct attribute_group apds990x_attribute_group[] = { {.attrs = sysfs_attrs_ctrl }, }; -static int apds990x_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int apds990x_probe(struct i2c_client *client) { struct apds990x_chip *chip; int err; @@ -1272,7 +1271,7 @@ static struct i2c_driver apds990x_driver = { .name = "apds990x", .pm = &apds990x_pm_ops, }, - .probe = apds990x_probe, + .probe_new = apds990x_probe, .remove = apds990x_remove, .id_table = apds990x_id, }; diff --git a/drivers/misc/bh1770glc.c b/drivers/misc/bh1770glc.c index d0dfa674414c..bedbe0efb330 100644 --- a/drivers/misc/bh1770glc.c +++ b/drivers/misc/bh1770glc.c @@ -1162,8 +1162,7 @@ static const struct attribute_group bh1770_attribute_group = { .attrs = sysfs_attrs }; -static int bh1770_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int bh1770_probe(struct i2c_client *client) { struct bh1770_chip *chip; int err; @@ -1379,7 +1378,7 @@ static struct i2c_driver bh1770_driver = { .name = "bh1770glc", .pm = &bh1770_pm_ops, }, - .probe = bh1770_probe, + .probe_new = bh1770_probe, .remove = bh1770_remove, .id_table = bh1770_id, }; diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c index 375f692ae9d6..fb95a2d5cef4 100644 --- a/drivers/misc/cxl/guest.c +++ b/drivers/misc/cxl/guest.c @@ -965,10 +965,10 @@ int cxl_guest_init_afu(struct cxl *adapter, int slice, struct device_node *afu_n * if it returns an error! */ if ((rc = cxl_register_afu(afu))) - goto err_put1; + goto err_put_dev; if ((rc = cxl_sysfs_afu_add(afu))) - goto err_put1; + goto err_del_dev; /* * pHyp doesn't expose the programming models supported by the @@ -984,7 +984,7 @@ int cxl_guest_init_afu(struct cxl *adapter, int slice, struct device_node *afu_n afu->modes_supported = CXL_MODE_DIRECTED; if ((rc = cxl_afu_select_best_mode(afu))) - goto err_put2; + goto err_remove_sysfs; adapter->afu[afu->slice] = afu; @@ -1004,10 +1004,12 @@ int cxl_guest_init_afu(struct cxl *adapter, int slice, struct device_node *afu_n return 0; -err_put2: +err_remove_sysfs: cxl_sysfs_afu_remove(afu); -err_put1: - device_unregister(&afu->dev); +err_del_dev: + device_del(&afu->dev); +err_put_dev: + put_device(&afu->dev); free = false; guest_release_serr_irq(afu); err2: @@ -1141,18 +1143,20 @@ struct cxl *cxl_guest_init_adapter(struct device_node *np, struct platform_devic * even if it returns an error! */ if ((rc = cxl_register_adapter(adapter))) - goto err_put1; + goto err_put_dev; if ((rc = cxl_sysfs_adapter_add(adapter))) - goto err_put1; + goto err_del_dev; /* release the context lock as the adapter is configured */ cxl_adapter_context_unlock(adapter); return adapter; -err_put1: - device_unregister(&adapter->dev); +err_del_dev: + device_del(&adapter->dev); +err_put_dev: + put_device(&adapter->dev); free = false; cxl_guest_remove_chardev(adapter); err1: diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index 3de0aea62ade..6d495d641c95 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -1164,10 +1164,10 @@ static int pci_init_afu(struct cxl *adapter, int slice, struct pci_dev *dev) * if it returns an error! */ if ((rc = cxl_register_afu(afu))) - goto err_put1; + goto err_put_dev; if ((rc = cxl_sysfs_afu_add(afu))) - goto err_put1; + goto err_del_dev; adapter->afu[afu->slice] = afu; @@ -1176,10 +1176,12 @@ static int pci_init_afu(struct cxl *adapter, int slice, struct pci_dev *dev) return 0; -err_put1: +err_del_dev: + device_del(&afu->dev); +err_put_dev: pci_deconfigure_afu(afu); cxl_debugfs_afu_remove(afu); - device_unregister(&afu->dev); + put_device(&afu->dev); return rc; err_free_native: @@ -1667,23 +1669,25 @@ static struct cxl *cxl_pci_init_adapter(struct pci_dev *dev) * even if it returns an error! */ if ((rc = cxl_register_adapter(adapter))) - goto err_put1; + goto err_put_dev; if ((rc = cxl_sysfs_adapter_add(adapter))) - goto err_put1; + goto err_del_dev; /* Release the context lock as adapter is configured */ cxl_adapter_context_unlock(adapter); return adapter; -err_put1: +err_del_dev: + device_del(&adapter->dev); +err_put_dev: /* This should mirror cxl_remove_adapter, except without the * sysfs parts */ cxl_debugfs_adapter_remove(adapter); cxl_deconfigure_adapter(adapter); - device_unregister(&adapter->dev); + put_device(&adapter->dev); return ERR_PTR(rc); err_release: diff --git a/drivers/misc/ds1682.c b/drivers/misc/ds1682.c index 0698ddc5f4d5..d517eed32971 100644 --- a/drivers/misc/ds1682.c +++ b/drivers/misc/ds1682.c @@ -200,8 +200,7 @@ static const struct bin_attribute ds1682_eeprom_attr = { /* * Called when a ds1682 device is matched with this driver */ -static int ds1682_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int ds1682_probe(struct i2c_client *client) { int rc; @@ -251,7 +250,7 @@ static struct i2c_driver ds1682_driver = { .name = "ds1682", .of_match_table = ds1682_of_match, }, - .probe = ds1682_probe, + .probe_new = ds1682_probe, .remove = ds1682_remove, .id_table = ds1682_id, }; diff --git a/drivers/misc/eeprom/eeprom.c b/drivers/misc/eeprom/eeprom.c index 8a841a75d893..32611100d5cd 100644 --- a/drivers/misc/eeprom/eeprom.c +++ b/drivers/misc/eeprom/eeprom.c @@ -141,8 +141,7 @@ static int eeprom_detect(struct i2c_client *client, struct i2c_board_info *info) return 0; } -static int eeprom_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int eeprom_probe(struct i2c_client *client) { struct i2c_adapter *adapter = client->adapter; struct eeprom_data *data; @@ -197,7 +196,7 @@ static struct i2c_driver eeprom_driver = { .driver = { .name = "eeprom", }, - .probe = eeprom_probe, + .probe_new = eeprom_probe, .remove = eeprom_remove, .id_table = eeprom_id, diff --git a/drivers/misc/eeprom/idt_89hpesx.c b/drivers/misc/eeprom/idt_89hpesx.c index bb3ed352b95f..4e07ee9cb500 100644 --- a/drivers/misc/eeprom/idt_89hpesx.c +++ b/drivers/misc/eeprom/idt_89hpesx.c @@ -1366,7 +1366,7 @@ static void idt_remove_dbgfs_files(struct idt_89hpesx_dev *pdev) /* * idt_probe() - IDT 89HPESx driver probe() callback method */ -static int idt_probe(struct i2c_client *client, const struct i2c_device_id *id) +static int idt_probe(struct i2c_client *client) { struct idt_89hpesx_dev *pdev; int ret; @@ -1556,7 +1556,7 @@ static struct i2c_driver idt_driver = { .name = IDT_NAME, .of_match_table = idt_of_match, }, - .probe = idt_probe, + .probe_new = idt_probe, .remove = idt_remove, .id_table = idt_ids, }; diff --git a/drivers/misc/eeprom/max6875.c b/drivers/misc/eeprom/max6875.c index 6bd4f4339af4..79cf8afcef2e 100644 --- a/drivers/misc/eeprom/max6875.c +++ b/drivers/misc/eeprom/max6875.c @@ -130,8 +130,7 @@ static const struct bin_attribute user_eeprom_attr = { .read = max6875_read, }; -static int max6875_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int max6875_probe(struct i2c_client *client) { struct i2c_adapter *adapter = client->adapter; struct max6875_data *data; @@ -193,7 +192,7 @@ static struct i2c_driver max6875_driver = { .driver = { .name = "max6875", }, - .probe = max6875_probe, + .probe_new = max6875_probe, .remove = max6875_remove, .id_table = max6875_id, }; diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c index 0f467a71b069..c9902a1dcf5d 100644 --- a/drivers/misc/fastrpc.c +++ b/drivers/misc/fastrpc.c @@ -20,6 +20,7 @@ #include <linux/slab.h> #include <linux/qcom_scm.h> #include <uapi/misc/fastrpc.h> +#include <linux/of_reserved_mem.h> #define ADSP_DOMAIN_ID (0) #define MDSP_DOMAIN_ID (1) @@ -37,8 +38,20 @@ #define FASTRPC_DSP_UTILITIES_HANDLE 2 #define FASTRPC_CTXID_MASK (0xFF0) #define INIT_FILELEN_MAX (2 * 1024 * 1024) +#define INIT_FILE_NAMELEN_MAX (128) #define FASTRPC_DEVICE_NAME "fastrpc" + +/* Add memory to static PD pool, protection thru XPU */ +#define ADSP_MMAP_HEAP_ADDR 4 +/* MAP static DMA buffer on DSP User PD */ +#define ADSP_MMAP_DMA_BUFFER 6 +/* Add memory to static PD pool protection thru hypervisor */ +#define ADSP_MMAP_REMOTE_HEAP_ADDR 8 +/* Add memory to userPD pool, for user heap */ #define ADSP_MMAP_ADD_PAGES 0x1000 +/* Add memory to userPD pool, for LLC heap */ +#define ADSP_MMAP_ADD_PAGES_LLC 0x3000, + #define DSP_UNSUPPORTED_API (0x80000414) /* MAX NUMBER of DSP ATTRIBUTES SUPPORTED */ #define FASTRPC_MAX_DSP_ATTRIBUTES (256) @@ -72,6 +85,7 @@ FASTRPC_BUILD_SCALARS(0, method, in, out, 0, 0) #define FASTRPC_CREATE_PROCESS_NARGS 6 +#define FASTRPC_CREATE_STATIC_PROCESS_NARGS 3 /* Remote Method id table */ #define FASTRPC_RMID_INIT_ATTACH 0 #define FASTRPC_RMID_INIT_RELEASE 1 @@ -84,7 +98,7 @@ #define FASTRPC_RMID_INIT_MEM_UNMAP 11 /* Protection Domain(PD) ids */ -#define AUDIO_PD (0) /* also GUEST_OS PD? */ +#define ROOT_PD (0) #define USER_PD (1) #define SENSORS_PD (2) @@ -261,8 +275,11 @@ struct fastrpc_channel_ctx { u32 dsp_attributes[FASTRPC_MAX_DSP_ATTRIBUTES]; struct fastrpc_device *secure_fdevice; struct fastrpc_device *fdevice; + struct fastrpc_buf *remote_heap; + struct list_head invoke_interrupted_mmaps; bool secure; bool unsigned_support; + u64 dma_mask; }; struct fastrpc_device { @@ -369,7 +386,7 @@ static void fastrpc_buf_free(struct fastrpc_buf *buf) kfree(buf); } -static int fastrpc_buf_alloc(struct fastrpc_user *fl, struct device *dev, +static int __fastrpc_buf_alloc(struct fastrpc_user *fl, struct device *dev, u64 size, struct fastrpc_buf **obuf) { struct fastrpc_buf *buf; @@ -397,14 +414,37 @@ static int fastrpc_buf_alloc(struct fastrpc_user *fl, struct device *dev, return -ENOMEM; } + *obuf = buf; + + return 0; +} + +static int fastrpc_buf_alloc(struct fastrpc_user *fl, struct device *dev, + u64 size, struct fastrpc_buf **obuf) +{ + int ret; + struct fastrpc_buf *buf; + + ret = __fastrpc_buf_alloc(fl, dev, size, obuf); + if (ret) + return ret; + + buf = *obuf; + if (fl->sctx && fl->sctx->sid) buf->phys += ((u64)fl->sctx->sid << 32); - *obuf = buf; - return 0; } +static int fastrpc_remote_heap_alloc(struct fastrpc_user *fl, struct device *dev, + u64 size, struct fastrpc_buf **obuf) +{ + struct device *rdev = &fl->cctx->rpdev->dev; + + return __fastrpc_buf_alloc(fl, rdev, size, obuf); +} + static void fastrpc_channel_ctx_free(struct kref *ref) { struct fastrpc_channel_ctx *cctx; @@ -714,6 +754,8 @@ static int fastrpc_map_create(struct fastrpc_user *fl, int fd, return -ENOMEM; INIT_LIST_HEAD(&map->node); + kref_init(&map->refcount); + map->fl = fl; map->fd = fd; map->buf = dma_buf_get(fd); @@ -740,7 +782,6 @@ static int fastrpc_map_create(struct fastrpc_user *fl, int fd, map->size = len; map->va = sg_virt(map->table->sgl); map->len = len; - kref_init(&map->refcount); if (attr & FASTRPC_ATTR_SECUREMAP) { /* @@ -770,7 +811,7 @@ map_err: attach_err: dma_buf_put(map->buf); get_err: - kfree(map); + fastrpc_map_put(map); return err; } @@ -1073,6 +1114,8 @@ static int fastrpc_internal_invoke(struct fastrpc_user *fl, u32 kernel, struct fastrpc_invoke_args *args) { struct fastrpc_invoke_ctx *ctx = NULL; + struct fastrpc_buf *buf, *b; + int err = 0; if (!fl->sctx) @@ -1135,6 +1178,14 @@ bail: spin_unlock(&fl->lock); fastrpc_context_put(ctx); } + + if (err == -ERESTARTSYS) { + list_for_each_entry_safe(buf, b, &fl->mmaps, node) { + list_del(&buf->node); + list_add_tail(&buf->node, &fl->cctx->invoke_interrupted_mmaps); + } + } + if (err) dev_dbg(fl->sctx->dev, "Error: Invoke Failed %d\n", err); @@ -1159,6 +1210,120 @@ static bool is_session_rejected(struct fastrpc_user *fl, bool unsigned_pd_reques return false; } +static int fastrpc_init_create_static_process(struct fastrpc_user *fl, + char __user *argp) +{ + struct fastrpc_init_create_static init; + struct fastrpc_invoke_args *args; + struct fastrpc_phy_page pages[1]; + char *name; + int err; + struct { + int pgid; + u32 namelen; + u32 pageslen; + } inbuf; + u32 sc; + + args = kcalloc(FASTRPC_CREATE_STATIC_PROCESS_NARGS, sizeof(*args), GFP_KERNEL); + if (!args) + return -ENOMEM; + + if (copy_from_user(&init, argp, sizeof(init))) { + err = -EFAULT; + goto err; + } + + if (init.namelen > INIT_FILE_NAMELEN_MAX) { + err = -EINVAL; + goto err; + } + + name = kzalloc(init.namelen, GFP_KERNEL); + if (!name) { + err = -ENOMEM; + goto err; + } + + if (copy_from_user(name, (void __user *)(uintptr_t)init.name, init.namelen)) { + err = -EFAULT; + goto err_name; + } + + if (!fl->cctx->remote_heap) { + err = fastrpc_remote_heap_alloc(fl, fl->sctx->dev, init.memlen, + &fl->cctx->remote_heap); + if (err) + goto err_name; + + /* Map if we have any heap VMIDs associated with this ADSP Static Process. */ + if (fl->cctx->vmcount) { + unsigned int perms = BIT(QCOM_SCM_VMID_HLOS); + + err = qcom_scm_assign_mem(fl->cctx->remote_heap->phys, + (u64)fl->cctx->remote_heap->size, &perms, + fl->cctx->vmperms, fl->cctx->vmcount); + if (err) { + dev_err(fl->sctx->dev, "Failed to assign memory with phys 0x%llx size 0x%llx err %d", + fl->cctx->remote_heap->phys, fl->cctx->remote_heap->size, err); + goto err_map; + } + } + } + + inbuf.pgid = fl->tgid; + inbuf.namelen = init.namelen; + inbuf.pageslen = 0; + fl->pd = USER_PD; + + args[0].ptr = (u64)(uintptr_t)&inbuf; + args[0].length = sizeof(inbuf); + args[0].fd = -1; + + args[1].ptr = (u64)(uintptr_t)name; + args[1].length = inbuf.namelen; + args[1].fd = -1; + + pages[0].addr = fl->cctx->remote_heap->phys; + pages[0].size = fl->cctx->remote_heap->size; + + args[2].ptr = (u64)(uintptr_t) pages; + args[2].length = sizeof(*pages); + args[2].fd = -1; + + sc = FASTRPC_SCALARS(FASTRPC_RMID_INIT_CREATE_STATIC, 3, 0); + + err = fastrpc_internal_invoke(fl, true, FASTRPC_INIT_HANDLE, + sc, args); + if (err) + goto err_invoke; + + kfree(args); + + return 0; +err_invoke: + if (fl->cctx->vmcount) { + struct qcom_scm_vmperm perm; + + perm.vmid = QCOM_SCM_VMID_HLOS; + perm.perm = QCOM_SCM_PERM_RWX; + err = qcom_scm_assign_mem(fl->cctx->remote_heap->phys, + (u64)fl->cctx->remote_heap->size, + &(fl->cctx->vmperms[0].vmid), &perm, 1); + if (err) + dev_err(fl->sctx->dev, "Failed to assign memory phys 0x%llx size 0x%llx err %d", + fl->cctx->remote_heap->phys, fl->cctx->remote_heap->size, err); + } +err_map: + fastrpc_buf_free(fl->cctx->remote_heap); +err_name: + kfree(name); +err: + kfree(args); + + return err; +} + static int fastrpc_init_create_process(struct fastrpc_user *fl, char __user *argp) { @@ -1605,30 +1770,14 @@ static int fastrpc_get_dsp_info(struct fastrpc_user *fl, char __user *argp) return 0; } -static int fastrpc_req_munmap_impl(struct fastrpc_user *fl, - struct fastrpc_req_munmap *req) +static int fastrpc_req_munmap_impl(struct fastrpc_user *fl, struct fastrpc_buf *buf) { struct fastrpc_invoke_args args[1] = { [0] = { 0 } }; - struct fastrpc_buf *buf = NULL, *iter, *b; struct fastrpc_munmap_req_msg req_msg; struct device *dev = fl->sctx->dev; int err; u32 sc; - spin_lock(&fl->lock); - list_for_each_entry_safe(iter, b, &fl->mmaps, node) { - if ((iter->raddr == req->vaddrout) && (iter->size == req->size)) { - buf = iter; - break; - } - } - spin_unlock(&fl->lock); - - if (!buf) { - dev_err(dev, "mmap not in list\n"); - return -EINVAL; - } - req_msg.pgid = fl->tgid; req_msg.size = buf->size; req_msg.vaddr = buf->raddr; @@ -1654,12 +1803,29 @@ static int fastrpc_req_munmap_impl(struct fastrpc_user *fl, static int fastrpc_req_munmap(struct fastrpc_user *fl, char __user *argp) { + struct fastrpc_buf *buf = NULL, *iter, *b; struct fastrpc_req_munmap req; + struct device *dev = fl->sctx->dev; if (copy_from_user(&req, argp, sizeof(req))) return -EFAULT; - return fastrpc_req_munmap_impl(fl, &req); + spin_lock(&fl->lock); + list_for_each_entry_safe(iter, b, &fl->mmaps, node) { + if ((iter->raddr == req.vaddrout) && (iter->size == req.size)) { + buf = iter; + break; + } + } + spin_unlock(&fl->lock); + + if (!buf) { + dev_err(dev, "mmap\t\tpt 0x%09llx [len 0x%08llx] not in list\n", + req.vaddrout, req.size); + return -EINVAL; + } + + return fastrpc_req_munmap_impl(fl, buf); } static int fastrpc_req_mmap(struct fastrpc_user *fl, char __user *argp) @@ -1668,7 +1834,6 @@ static int fastrpc_req_mmap(struct fastrpc_user *fl, char __user *argp) struct fastrpc_buf *buf = NULL; struct fastrpc_mmap_req_msg req_msg; struct fastrpc_mmap_rsp_msg rsp_msg; - struct fastrpc_req_munmap req_unmap; struct fastrpc_phy_page pages; struct fastrpc_req_mmap req; struct device *dev = fl->sctx->dev; @@ -1678,8 +1843,9 @@ static int fastrpc_req_mmap(struct fastrpc_user *fl, char __user *argp) if (copy_from_user(&req, argp, sizeof(req))) return -EFAULT; - if (req.flags != ADSP_MMAP_ADD_PAGES) { + if (req.flags != ADSP_MMAP_ADD_PAGES && req.flags != ADSP_MMAP_REMOTE_HEAP_ADDR) { dev_err(dev, "flag not supported 0x%x\n", req.flags); + return -EINVAL; } @@ -1725,16 +1891,29 @@ static int fastrpc_req_mmap(struct fastrpc_user *fl, char __user *argp) /* let the client know the address to use */ req.vaddrout = rsp_msg.vaddr; + /* Add memory to static PD pool, protection thru hypervisor */ + if (req.flags != ADSP_MMAP_REMOTE_HEAP_ADDR && fl->cctx->vmcount) { + struct qcom_scm_vmperm perm; + int err = 0; + + perm.vmid = QCOM_SCM_VMID_HLOS; + perm.perm = QCOM_SCM_PERM_RWX; + err = qcom_scm_assign_mem(buf->phys, buf->size, + &(fl->cctx->vmperms[0].vmid), &perm, 1); + if (err) { + dev_err(fl->sctx->dev, "Failed to assign memory phys 0x%llx size 0x%llx err %d", + buf->phys, buf->size, err); + goto err_assign; + } + } + spin_lock(&fl->lock); list_add_tail(&buf->node, &fl->mmaps); spin_unlock(&fl->lock); if (copy_to_user((void __user *)argp, &req, sizeof(req))) { - /* unmap the memory and release the buffer */ - req_unmap.vaddrout = buf->raddr; - req_unmap.size = buf->size; - fastrpc_req_munmap_impl(fl, &req_unmap); - return -EFAULT; + err = -EFAULT; + goto err_assign; } dev_dbg(dev, "mmap\t\tpt 0x%09lx OK [len 0x%08llx]\n", @@ -1742,6 +1921,8 @@ static int fastrpc_req_mmap(struct fastrpc_user *fl, char __user *argp) return 0; +err_assign: + fastrpc_req_munmap_impl(fl, buf); err_invoke: fastrpc_buf_free(buf); @@ -1889,11 +2070,14 @@ static long fastrpc_device_ioctl(struct file *file, unsigned int cmd, err = fastrpc_invoke(fl, argp); break; case FASTRPC_IOCTL_INIT_ATTACH: - err = fastrpc_init_attach(fl, AUDIO_PD); + err = fastrpc_init_attach(fl, ROOT_PD); break; case FASTRPC_IOCTL_INIT_ATTACH_SNS: err = fastrpc_init_attach(fl, SENSORS_PD); break; + case FASTRPC_IOCTL_INIT_CREATE_STATIC: + err = fastrpc_init_create_static_process(fl, argp); + break; case FASTRPC_IOCTL_INIT_CREATE: err = fastrpc_init_create_process(fl, argp); break; @@ -2068,6 +2252,9 @@ static int fastrpc_rpmsg_probe(struct rpmsg_device *rpdev) return -EINVAL; } + if (of_reserved_mem_device_init_by_idx(rdev, rdev->of_node, 0)) + dev_info(rdev, "no reserved DMA memory for FASTRPC\n"); + vmcount = of_property_read_variable_u32_array(rdev->of_node, "qcom,vmids", &vmids[0], 0, FASTRPC_MAX_VMIDS); if (vmcount < 0) @@ -2120,8 +2307,10 @@ static int fastrpc_rpmsg_probe(struct rpmsg_device *rpdev) kref_init(&data->refcount); dev_set_drvdata(&rpdev->dev, data); + rdev->dma_mask = &data->dma_mask; dma_set_mask_and_coherent(rdev, DMA_BIT_MASK(32)); INIT_LIST_HEAD(&data->users); + INIT_LIST_HEAD(&data->invoke_interrupted_mmaps); spin_lock_init(&data->lock); idr_init(&data->ctx_idr); data->domain_id = domain_id; @@ -2146,6 +2335,7 @@ static void fastrpc_notify_users(struct fastrpc_user *user) static void fastrpc_rpmsg_remove(struct rpmsg_device *rpdev) { struct fastrpc_channel_ctx *cctx = dev_get_drvdata(&rpdev->dev); + struct fastrpc_buf *buf, *b; struct fastrpc_user *user; unsigned long flags; @@ -2160,6 +2350,12 @@ static void fastrpc_rpmsg_remove(struct rpmsg_device *rpdev) if (cctx->secure_fdevice) misc_deregister(&cctx->secure_fdevice->miscdev); + list_for_each_entry_safe(buf, b, &cctx->invoke_interrupted_mmaps, node) + list_del(&buf->node); + + if (cctx->remote_heap) + fastrpc_buf_free(cctx->remote_heap); + of_platform_depopulate(&rpdev->dev); cctx->rpdev = NULL; diff --git a/drivers/misc/genwqe/card_base.c b/drivers/misc/genwqe/card_base.c index 693981891870..bae8114f2805 100644 --- a/drivers/misc/genwqe/card_base.c +++ b/drivers/misc/genwqe/card_base.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/** +/* * IBM Accelerator Family 'GenWQE' * * (C) Copyright IBM Corp. 2013 diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index fa05770865c6..ea0e5101c10e 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -742,13 +742,11 @@ static void cs_do_release(struct kref *ref) */ if (hl_cs_cmpl->encaps_signals) kref_put(&hl_cs_cmpl->encaps_sig_hdl->refcount, - hl_encaps_handle_do_release); + hl_encaps_release_handle_and_put_ctx); } - if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT) - && cs->encaps_signals) - kref_put(&cs->encaps_sig_hdl->refcount, - hl_encaps_handle_do_release); + if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT) && cs->encaps_signals) + kref_put(&cs->encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx); out: /* Must be called before hl_ctx_put because inside we use ctx to get @@ -798,7 +796,7 @@ out: static void cs_timedout(struct work_struct *work) { struct hl_device *hdev; - u64 event_mask; + u64 event_mask = 0x0; int rc; struct hl_cs *cs = container_of(work, struct hl_cs, work_tdr.work); @@ -830,11 +828,7 @@ static void cs_timedout(struct work_struct *work) if (rc) { hdev->captured_err_info.cs_timeout.timestamp = ktime_get(); hdev->captured_err_info.cs_timeout.seq = cs->sequence; - - event_mask = device_reset ? (HL_NOTIFIER_EVENT_CS_TIMEOUT | - HL_NOTIFIER_EVENT_DEVICE_RESET) : HL_NOTIFIER_EVENT_CS_TIMEOUT; - - hl_notifier_event_send_all(hdev, event_mask); + event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT; } switch (cs->type) { @@ -869,8 +863,12 @@ static void cs_timedout(struct work_struct *work) cs_put(cs); - if (device_reset) - hl_device_reset(hdev, HL_DRV_RESET_TDR); + if (device_reset) { + event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET; + hl_device_cond_reset(hdev, HL_DRV_RESET_TDR, event_mask); + } else if (event_mask) { + hl_notifier_event_send_all(hdev, event_mask); + } } static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, @@ -1011,6 +1009,34 @@ static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs) hl_complete_job(hdev, job); } +/* + * release_reserved_encaps_signals() - release reserved encapsulated signals. + * @hdev: pointer to habanalabs device structure + * + * Release reserved encapsulated signals which weren't un-reserved, or for which a CS with + * encapsulated signals wasn't submitted and thus weren't released as part of CS roll-back. + * For these signals need also to put the refcount of the H/W SOB which was taken at the + * reservation. + */ +static void release_reserved_encaps_signals(struct hl_device *hdev) +{ + struct hl_ctx *ctx = hl_get_compute_ctx(hdev); + struct hl_cs_encaps_sig_handle *handle; + struct hl_encaps_signals_mgr *mgr; + u32 id; + + if (!ctx) + return; + + mgr = &ctx->sig_mgr; + + idr_for_each_entry(&mgr->handles, handle, id) + if (handle->cs_seq == ULLONG_MAX) + kref_put(&handle->refcount, hl_encaps_release_handle_and_put_sob_ctx); + + hl_ctx_put(ctx); +} + void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush) { int i; @@ -1039,6 +1065,8 @@ void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush) } force_complete_multi_cs(hdev); + + release_reserved_encaps_signals(hdev); } static void @@ -2001,6 +2029,8 @@ static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv, */ handle->pre_sob_val = prop->next_sob_val - handle->count; + handle->cs_seq = ULLONG_MAX; + *signals_count = prop->next_sob_val; hdev->asic_funcs->hw_queues_unlock(hdev); @@ -2350,10 +2380,8 @@ put_cs: /* We finished with the CS in this function, so put the ref */ cs_put(cs); free_cs_chunk_array: - if (!wait_cs_submitted && cs_encaps_signals && handle_found && - is_wait_cs) - kref_put(&encaps_sig_hdl->refcount, - hl_encaps_handle_do_release); + if (!wait_cs_submitted && cs_encaps_signals && handle_found && is_wait_cs) + kref_put(&encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx); kfree(cs_chunk_array); out: return rc; diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c index 2f4620b7990c..9c8b1b37b510 100644 --- a/drivers/misc/habanalabs/common/context.c +++ b/drivers/misc/habanalabs/common/context.c @@ -9,38 +9,46 @@ #include <linux/slab.h> -void hl_encaps_handle_do_release(struct kref *ref) +static void encaps_handle_do_release(struct hl_cs_encaps_sig_handle *handle, bool put_hw_sob, + bool put_ctx) { - struct hl_cs_encaps_sig_handle *handle = - container_of(ref, struct hl_cs_encaps_sig_handle, refcount); struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr; + if (put_hw_sob) + hw_sob_put(handle->hw_sob); + spin_lock(&mgr->lock); idr_remove(&mgr->handles, handle->id); spin_unlock(&mgr->lock); - hl_ctx_put(handle->ctx); + if (put_ctx) + hl_ctx_put(handle->ctx); + kfree(handle); } -static void hl_encaps_handle_do_release_sob(struct kref *ref) +void hl_encaps_release_handle_and_put_ctx(struct kref *ref) { struct hl_cs_encaps_sig_handle *handle = - container_of(ref, struct hl_cs_encaps_sig_handle, refcount); - struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr; + container_of(ref, struct hl_cs_encaps_sig_handle, refcount); - /* if we're here, then there was a signals reservation but cs with - * encaps signals wasn't submitted, so need to put refcount - * to hw_sob taken at the reservation. - */ - hw_sob_put(handle->hw_sob); + encaps_handle_do_release(handle, false, true); +} - spin_lock(&mgr->lock); - idr_remove(&mgr->handles, handle->id); - spin_unlock(&mgr->lock); +static void hl_encaps_release_handle_and_put_sob(struct kref *ref) +{ + struct hl_cs_encaps_sig_handle *handle = + container_of(ref, struct hl_cs_encaps_sig_handle, refcount); - hl_ctx_put(handle->ctx); - kfree(handle); + encaps_handle_do_release(handle, true, false); +} + +void hl_encaps_release_handle_and_put_sob_ctx(struct kref *ref) +{ + struct hl_cs_encaps_sig_handle *handle = + container_of(ref, struct hl_cs_encaps_sig_handle, refcount); + + encaps_handle_do_release(handle, true, true); } static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr) @@ -49,8 +57,7 @@ static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr) idr_init(&mgr->handles); } -static void hl_encaps_sig_mgr_fini(struct hl_device *hdev, - struct hl_encaps_signals_mgr *mgr) +static void hl_encaps_sig_mgr_fini(struct hl_device *hdev, struct hl_encaps_signals_mgr *mgr) { struct hl_cs_encaps_sig_handle *handle; struct idr *idp; @@ -58,11 +65,14 @@ static void hl_encaps_sig_mgr_fini(struct hl_device *hdev, idp = &mgr->handles; + /* The IDR is expected to be empty at this stage, because any left signal should have been + * released as part of CS roll-back. + */ if (!idr_is_empty(idp)) { - dev_warn(hdev->dev, "device released while some encaps signals handles are still allocated\n"); + dev_warn(hdev->dev, + "device released while some encaps signals handles are still allocated\n"); idr_for_each_entry(idp, handle, id) - kref_put(&handle->refcount, - hl_encaps_handle_do_release_sob); + kref_put(&handle->refcount, hl_encaps_release_handle_and_put_sob); } idr_destroy(&mgr->handles); diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c index 48d3ec8b5c82..945c0e6758ca 100644 --- a/drivers/misc/habanalabs/common/debugfs.c +++ b/drivers/misc/habanalabs/common/debugfs.c @@ -1769,6 +1769,11 @@ void hl_debugfs_add_device(struct hl_device *hdev) dev_entry, &hl_timeout_locked_fops); + debugfs_create_u32("device_release_watchdog_timeout", + 0644, + dev_entry->root, + &hdev->device_release_watchdog_timeout_sec); + for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) { debugfs_create_file(hl_debugfs_list[i].name, 0444, diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 233d8b46c831..87ab329e65d4 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -12,10 +12,13 @@ #include <linux/pci.h> #include <linux/hwmon.h> +#include <linux/vmalloc.h> #include <trace/events/habanalabs.h> -#define HL_RESET_DELAY_USEC 10000 /* 10ms */ +#define HL_RESET_DELAY_USEC 10000 /* 10ms */ + +#define HL_DEVICE_RELEASE_WATCHDOG_TIMEOUT_SEC 5 enum dma_alloc_type { DMA_ALLOC_COHERENT, @@ -31,6 +34,7 @@ enum dma_alloc_type { * @hdev: pointer to habanalabs device structure. * @addr: the address the caller wants to access. * @region: the PCI region. + * @new_bar_region_base: the new BAR region base address. * * @return: the old BAR base address on success, U64_MAX for failure. * The caller should set it back to the old address after use. @@ -40,7 +44,8 @@ enum dma_alloc_type { * This function can be called also if the bar doesn't need to be set, * in that case it just won't change the base. */ -static u64 hl_set_dram_bar(struct hl_device *hdev, u64 addr, struct pci_mem_region *region) +static u64 hl_set_dram_bar(struct hl_device *hdev, u64 addr, struct pci_mem_region *region, + u64 *new_bar_region_base) { struct asic_fixed_properties *prop = &hdev->asic_prop; u64 bar_base_addr, old_base; @@ -54,27 +59,28 @@ static u64 hl_set_dram_bar(struct hl_device *hdev, u64 addr, struct pci_mem_regi old_base = hdev->asic_funcs->set_dram_bar_base(hdev, bar_base_addr); /* in case of success we need to update the new BAR base */ - if (old_base != U64_MAX) - region->region_base = bar_base_addr; + if ((old_base != U64_MAX) && new_bar_region_base) + *new_bar_region_base = bar_base_addr; return old_base; } -static int hl_access_sram_dram_region(struct hl_device *hdev, u64 addr, u64 *val, - enum debugfs_access_type acc_type, enum pci_region region_type) +int hl_access_sram_dram_region(struct hl_device *hdev, u64 addr, u64 *val, + enum debugfs_access_type acc_type, enum pci_region region_type, bool set_dram_bar) { struct pci_mem_region *region = &hdev->pci_mem_region[region_type]; + u64 old_base = 0, rc, bar_region_base = region->region_base; void __iomem *acc_addr; - u64 old_base = 0, rc; - if (region_type == PCI_REGION_DRAM) { - old_base = hl_set_dram_bar(hdev, addr, region); + if (set_dram_bar) { + old_base = hl_set_dram_bar(hdev, addr, region, &bar_region_base); if (old_base == U64_MAX) return -EIO; } - acc_addr = hdev->pcie_bar[region->bar_id] + addr - region->region_base + - region->offset_in_bar; + acc_addr = hdev->pcie_bar[region->bar_id] + region->offset_in_bar + + (addr - bar_region_base); + switch (acc_type) { case DEBUGFS_READ8: *val = readb(acc_addr); @@ -96,8 +102,8 @@ static int hl_access_sram_dram_region(struct hl_device *hdev, u64 addr, u64 *val break; } - if (region_type == PCI_REGION_DRAM) { - rc = hl_set_dram_bar(hdev, old_base, region); + if (set_dram_bar) { + rc = hl_set_dram_bar(hdev, old_base, region, NULL); if (rc == U64_MAX) return -EIO; } @@ -134,6 +140,9 @@ static void hl_asic_dma_free_common(struct hl_device *hdev, size_t size, void *c dma_addr_t dma_handle, enum dma_alloc_type alloc_type, const char *caller) { + /* this is needed to avoid warning on using freed pointer */ + u64 store_cpu_addr = (u64) (uintptr_t) cpu_addr; + switch (alloc_type) { case DMA_ALLOC_COHERENT: hdev->asic_funcs->asic_dma_free_coherent(hdev, size, cpu_addr, dma_handle); @@ -146,7 +155,7 @@ static void hl_asic_dma_free_common(struct hl_device *hdev, size_t size, void *c break; } - trace_habanalabs_dma_free(hdev->dev, (u64) (uintptr_t) cpu_addr, dma_handle, size, caller); + trace_habanalabs_dma_free(hdev->dev, store_cpu_addr, dma_handle, size, caller); } void *hl_asic_dma_alloc_coherent_caller(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle, @@ -279,7 +288,7 @@ int hl_access_dev_mem(struct hl_device *hdev, enum pci_region region_type, case PCI_REGION_SRAM: case PCI_REGION_DRAM: return hl_access_sram_dram_region(hdev, addr, val, acc_type, - region_type); + region_type, (region_type == PCI_REGION_DRAM)); default: return -EFAULT; } @@ -355,10 +364,49 @@ bool hl_device_operational(struct hl_device *hdev, } } +bool hl_ctrl_device_operational(struct hl_device *hdev, + enum hl_device_status *status) +{ + enum hl_device_status current_status; + + current_status = hl_device_status(hdev); + if (status) + *status = current_status; + + switch (current_status) { + case HL_DEVICE_STATUS_MALFUNCTION: + return false; + case HL_DEVICE_STATUS_IN_RESET: + case HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE: + case HL_DEVICE_STATUS_NEEDS_RESET: + case HL_DEVICE_STATUS_OPERATIONAL: + case HL_DEVICE_STATUS_IN_DEVICE_CREATION: + default: + return true; + } +} + +static void print_idle_status_mask(struct hl_device *hdev, const char *message, + u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE]) +{ + u32 pad_width[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {}; + + BUILD_BUG_ON(HL_BUSY_ENGINES_MASK_EXT_SIZE != 4); + + pad_width[3] = idle_mask[3] ? 16 : 0; + pad_width[2] = idle_mask[2] || pad_width[3] ? 16 : 0; + pad_width[1] = idle_mask[1] || pad_width[2] ? 16 : 0; + pad_width[0] = idle_mask[0] || pad_width[1] ? 16 : 0; + + dev_err(hdev->dev, "%s (mask %0*llx_%0*llx_%0*llx_%0*llx)\n", + message, pad_width[3], idle_mask[3], pad_width[2], idle_mask[2], + pad_width[1], idle_mask[1], pad_width[0], idle_mask[0]); +} + static void hpriv_release(struct kref *ref) { u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0}; - bool device_is_idle = true; + bool reset_device, device_is_idle = true; struct hl_fpriv *hpriv; struct hl_device *hdev; @@ -375,15 +423,19 @@ static void hpriv_release(struct kref *ref) mutex_destroy(&hpriv->ctx_lock); mutex_destroy(&hpriv->restore_phase_mutex); - if ((!hdev->pldm) && (hdev->pdev) && - (!hdev->asic_funcs->is_device_idle(hdev, - idle_mask, - HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL))) { - dev_err(hdev->dev, - "device not idle after user context is closed (0x%llx_%llx)\n", - idle_mask[1], idle_mask[0]); + /* Device should be reset if reset-upon-device-release is enabled, or if there is a pending + * reset that waits for device release. + */ + reset_device = hdev->reset_upon_device_release || hdev->reset_info.watchdog_active; - device_is_idle = false; + /* Unless device is reset in any case, check idle status and reset if device is not idle */ + if (!reset_device && hdev->pdev && !hdev->pldm) + device_is_idle = hdev->asic_funcs->is_device_idle(hdev, idle_mask, + HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL); + if (!device_is_idle) { + print_idle_status_mask(hdev, "device is not idle after user context is closed", + idle_mask); + reset_device = true; } /* We need to remove the user from the list to make sure the reset process won't @@ -399,9 +451,10 @@ static void hpriv_release(struct kref *ref) list_del(&hpriv->dev_node); mutex_unlock(&hdev->fpriv_list_lock); - if (!device_is_idle || hdev->reset_upon_device_release) { + if (reset_device) { hl_device_reset(hdev, HL_DRV_RESET_DEV_RELEASE); } else { + /* Scrubbing is handled within hl_device_reset(), so here need to do it directly */ int rc = hdev->asic_funcs->scrub_device_mem(hdev); if (rc) @@ -468,9 +521,10 @@ static int hl_device_release(struct inode *inode, struct file *filp) hdev->compute_ctx_in_release = 1; - if (!hl_hpriv_put(hpriv)) - dev_notice(hdev->dev, - "User process closed FD but device still in use\n"); + if (!hl_hpriv_put(hpriv)) { + dev_notice(hdev->dev, "User process closed FD but device still in use\n"); + hl_device_reset(hdev, HL_DRV_RESET_HARD); + } hdev->last_open_session_duration_jif = jiffies - hdev->last_successful_open_jif; @@ -658,17 +712,42 @@ static void device_hard_reset_pending(struct work_struct *work) flags = device_reset_work->flags | HL_DRV_RESET_FROM_RESET_THR; rc = hl_device_reset(hdev, flags); + if ((rc == -EBUSY) && !hdev->device_fini_pending) { - dev_info(hdev->dev, - "Could not reset device. will try again in %u seconds", - HL_PENDING_RESET_PER_SEC); + struct hl_ctx *ctx = hl_get_compute_ctx(hdev); - queue_delayed_work(device_reset_work->wq, - &device_reset_work->reset_work, - msecs_to_jiffies(HL_PENDING_RESET_PER_SEC * 1000)); + if (ctx) { + /* The read refcount value should subtracted by one, because the read is + * protected with hl_get_compute_ctx(). + */ + dev_info(hdev->dev, + "Could not reset device (compute_ctx refcount %u). will try again in %u seconds", + kref_read(&ctx->refcount) - 1, HL_PENDING_RESET_PER_SEC); + hl_ctx_put(ctx); + } else { + dev_info(hdev->dev, "Could not reset device. will try again in %u seconds", + HL_PENDING_RESET_PER_SEC); + } + + queue_delayed_work(hdev->reset_wq, &device_reset_work->reset_work, + msecs_to_jiffies(HL_PENDING_RESET_PER_SEC * 1000)); } } +static void device_release_watchdog_func(struct work_struct *work) +{ + struct hl_device_reset_work *device_release_watchdog_work = + container_of(work, struct hl_device_reset_work, reset_work.work); + struct hl_device *hdev = device_release_watchdog_work->hdev; + u32 flags; + + dev_dbg(hdev->dev, "Device wasn't released in time. Initiate device reset.\n"); + + flags = device_release_watchdog_work->flags | HL_DRV_RESET_FROM_WD_THR; + + hl_device_reset(hdev, flags); +} + /* * device_early_init - do some early initialization for the habanalabs device * @@ -699,9 +778,10 @@ static int device_early_init(struct hl_device *hdev) gaudi2_set_asic_funcs(hdev); strscpy(hdev->asic_name, "GAUDI2", sizeof(hdev->asic_name)); break; - case ASIC_GAUDI2_SEC: + case ASIC_GAUDI2B: gaudi2_set_asic_funcs(hdev); - strscpy(hdev->asic_name, "GAUDI2 SEC", sizeof(hdev->asic_name)); + strscpy(hdev->asic_name, "GAUDI2B", sizeof(hdev->asic_name)); + break; break; default: dev_err(hdev->dev, "Unrecognized ASIC type %d\n", @@ -737,7 +817,7 @@ static int device_early_init(struct hl_device *hdev) } } - hdev->eq_wq = alloc_workqueue("hl-events", WQ_UNBOUND, 0); + hdev->eq_wq = create_singlethread_workqueue("hl-events"); if (hdev->eq_wq == NULL) { dev_err(hdev->dev, "Failed to allocate EQ workqueue\n"); rc = -ENOMEM; @@ -760,8 +840,8 @@ static int device_early_init(struct hl_device *hdev) goto free_cs_cmplt_wq; } - hdev->pf_wq = alloc_workqueue("hl-prefetch", WQ_UNBOUND, 0); - if (!hdev->pf_wq) { + hdev->prefetch_wq = alloc_workqueue("hl-prefetch", WQ_UNBOUND, 0); + if (!hdev->prefetch_wq) { dev_err(hdev->dev, "Failed to allocate MMU prefetch workqueue\n"); rc = -ENOMEM; goto free_ts_free_wq; @@ -771,7 +851,7 @@ static int device_early_init(struct hl_device *hdev) GFP_KERNEL); if (!hdev->hl_chip_info) { rc = -ENOMEM; - goto free_pf_wq; + goto free_prefetch_wq; } rc = hl_mmu_if_set_funcs(hdev); @@ -780,19 +860,21 @@ static int device_early_init(struct hl_device *hdev) hl_mem_mgr_init(hdev->dev, &hdev->kernel_mem_mgr); - hdev->device_reset_work.wq = - create_singlethread_workqueue("hl_device_reset"); - if (!hdev->device_reset_work.wq) { + hdev->reset_wq = create_singlethread_workqueue("hl_device_reset"); + if (!hdev->reset_wq) { rc = -ENOMEM; dev_err(hdev->dev, "Failed to create device reset WQ\n"); goto free_cb_mgr; } - INIT_DELAYED_WORK(&hdev->device_reset_work.reset_work, - device_hard_reset_pending); + INIT_DELAYED_WORK(&hdev->device_reset_work.reset_work, device_hard_reset_pending); hdev->device_reset_work.hdev = hdev; hdev->device_fini_pending = 0; + INIT_DELAYED_WORK(&hdev->device_release_watchdog_work.reset_work, + device_release_watchdog_func); + hdev->device_release_watchdog_work.hdev = hdev; + mutex_init(&hdev->send_cpu_message_lock); mutex_init(&hdev->debug_lock); INIT_LIST_HEAD(&hdev->cs_mirror_list); @@ -810,8 +892,8 @@ free_cb_mgr: hl_mem_mgr_fini(&hdev->kernel_mem_mgr); free_chip_info: kfree(hdev->hl_chip_info); -free_pf_wq: - destroy_workqueue(hdev->pf_wq); +free_prefetch_wq: + destroy_workqueue(hdev->prefetch_wq); free_ts_free_wq: destroy_workqueue(hdev->ts_free_obj_wq); free_cs_cmplt_wq: @@ -854,11 +936,11 @@ static void device_early_fini(struct hl_device *hdev) kfree(hdev->hl_chip_info); - destroy_workqueue(hdev->pf_wq); + destroy_workqueue(hdev->prefetch_wq); destroy_workqueue(hdev->ts_free_obj_wq); destroy_workqueue(hdev->cs_cmplt_wq); destroy_workqueue(hdev->eq_wq); - destroy_workqueue(hdev->device_reset_work.wq); + destroy_workqueue(hdev->reset_wq); for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) destroy_workqueue(hdev->cq_wq[i]); @@ -962,11 +1044,16 @@ static void device_late_fini(struct hl_device *hdev) int hl_device_utilization(struct hl_device *hdev, u32 *utilization) { - u64 max_power, curr_power, dc_power, dividend; + u64 max_power, curr_power, dc_power, dividend, divisor; int rc; max_power = hdev->max_power; dc_power = hdev->asic_prop.dc_power_default; + divisor = max_power - dc_power; + if (!divisor) { + dev_warn(hdev->dev, "device utilization is not supported\n"); + return -EOPNOTSUPP; + } rc = hl_fw_cpucp_power_get(hdev, &curr_power); if (rc) @@ -975,7 +1062,7 @@ int hl_device_utilization(struct hl_device *hdev, u32 *utilization) curr_power = clamp(curr_power, dc_power, max_power); dividend = (curr_power - dc_power) * 100; - *utilization = (u32) div_u64(dividend, (max_power - dc_power)); + *utilization = (u32) div_u64(dividend, divisor); return 0; } @@ -1053,7 +1140,7 @@ static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_r hl_cs_rollback_all(hdev, skip_wq_flush); /* flush the MMU prefetch workqueue */ - flush_workqueue(hdev->pf_wq); + flush_workqueue(hdev->prefetch_wq); /* Release all pending user interrupts, each pending user interrupt * holds a reference to user context @@ -1264,6 +1351,10 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags) { u32 cur_reset_trigger = HL_RESET_TRIGGER_DEFAULT; + /* No consecutive mechanism when user context exists */ + if (hdev->is_compute_ctx_active) + return; + /* * 'reset cause' is being updated here, because getting here * means that it's the 1st time and the last time we're here @@ -1337,8 +1428,8 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags) int hl_device_reset(struct hl_device *hdev, u32 flags) { bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false, - reset_upon_device_release = false, schedule_hard_reset = false, - skip_wq_flush, delay_reset; + reset_upon_device_release = false, schedule_hard_reset = false, delay_reset, + from_dev_release, from_watchdog_thread; u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0}; struct hl_ctx *ctx; int i, rc; @@ -1351,8 +1442,9 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) hard_reset = !!(flags & HL_DRV_RESET_HARD); from_hard_reset_thread = !!(flags & HL_DRV_RESET_FROM_RESET_THR); fw_reset = !!(flags & HL_DRV_RESET_BYPASS_REQ_TO_FW); - skip_wq_flush = !!(flags & HL_DRV_RESET_DEV_RELEASE); + from_dev_release = !!(flags & HL_DRV_RESET_DEV_RELEASE); delay_reset = !!(flags & HL_DRV_RESET_DELAY); + from_watchdog_thread = !!(flags & HL_DRV_RESET_FROM_WD_THR); if (!hard_reset && !hdev->asic_prop.supports_compute_reset) { hard_instead_soft = true; @@ -1409,6 +1501,23 @@ do_reset: spin_unlock(&hdev->reset_info.lock); + /* Cancel the device release watchdog work if required. + * In case of reset-upon-device-release while the release watchdog work is + * scheduled, do hard-reset instead of compute-reset. + */ + if ((hard_reset || from_dev_release) && hdev->reset_info.watchdog_active) { + hdev->reset_info.watchdog_active = 0; + if (!from_watchdog_thread) + cancel_delayed_work_sync( + &hdev->device_release_watchdog_work.reset_work); + + if (from_dev_release) { + flags |= HL_DRV_RESET_HARD; + flags &= ~HL_DRV_RESET_DEV_RELEASE; + hard_reset = true; + } + } + if (delay_reset) usleep_range(HL_RESET_DELAY_USEC, HL_RESET_DELAY_USEC << 1); @@ -1439,13 +1548,12 @@ again: * Because the reset function can't run from heartbeat work, * we need to call the reset function from a dedicated work. */ - queue_delayed_work(hdev->device_reset_work.wq, - &hdev->device_reset_work.reset_work, 0); + queue_delayed_work(hdev->reset_wq, &hdev->device_reset_work.reset_work, 0); return 0; } - cleanup_resources(hdev, hard_reset, fw_reset, skip_wq_flush); + cleanup_resources(hdev, hard_reset, fw_reset, from_dev_release); kill_processes: if (hard_reset) { @@ -1581,9 +1689,8 @@ kill_processes: /* If device is not idle fail the reset process */ if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask, - HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) { - dev_err(hdev->dev, "device is not idle (mask 0x%llx_%llx) after reset\n", - idle_mask[1], idle_mask[0]); + HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) { + print_idle_status_mask(hdev, "device is not idle after reset", idle_mask); rc = -EIO; goto out_err; } @@ -1658,18 +1765,19 @@ kill_processes: * the device will be operational although it shouldn't be */ hdev->asic_funcs->enable_events_from_fw(hdev); - } else if (!reset_upon_device_release) { - hdev->reset_info.compute_reset_cnt++; - } - - if (schedule_hard_reset) { - dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n"); - flags = hdev->reset_info.hard_reset_schedule_flags; - hdev->reset_info.hard_reset_schedule_flags = 0; - hdev->disabled = true; - hard_reset = true; - handle_reset_trigger(hdev, flags); - goto again; + } else { + if (!reset_upon_device_release) + hdev->reset_info.compute_reset_cnt++; + + if (schedule_hard_reset) { + dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n"); + flags = hdev->reset_info.hard_reset_schedule_flags; + hdev->reset_info.hard_reset_schedule_flags = 0; + hdev->disabled = true; + hard_reset = true; + handle_reset_trigger(hdev, flags); + goto again; + } } return 0; @@ -1706,6 +1814,73 @@ out_err: return rc; } +/* + * hl_device_cond_reset() - conditionally reset the device. + * @hdev: pointer to habanalabs device structure. + * @reset_flags: reset flags. + * @event_mask: events to notify user about. + * + * Conditionally reset the device, or alternatively schedule a watchdog work to reset the device + * unless another reset precedes it. + */ +int hl_device_cond_reset(struct hl_device *hdev, u32 flags, u64 event_mask) +{ + struct hl_ctx *ctx = NULL; + + /* Device release watchdog is only for hard reset */ + if (!(flags & HL_DRV_RESET_HARD) && hdev->asic_prop.allow_inference_soft_reset) + goto device_reset; + + /* F/W reset cannot be postponed */ + if (flags & HL_DRV_RESET_BYPASS_REQ_TO_FW) + goto device_reset; + + /* Device release watchdog is relevant only if user exists and gets a reset notification */ + if (!(event_mask & HL_NOTIFIER_EVENT_DEVICE_RESET)) { + dev_err(hdev->dev, "Resetting device without a reset indication to user\n"); + goto device_reset; + } + + ctx = hl_get_compute_ctx(hdev); + if (!ctx || !ctx->hpriv->notifier_event.eventfd) + goto device_reset; + + /* Schedule the device release watchdog work unless reset is already in progress or if the + * work is already scheduled. + */ + spin_lock(&hdev->reset_info.lock); + if (hdev->reset_info.in_reset) { + spin_unlock(&hdev->reset_info.lock); + goto device_reset; + } + + if (hdev->reset_info.watchdog_active) + goto out; + + hdev->device_release_watchdog_work.flags = flags; + dev_dbg(hdev->dev, "Device is going to be reset in %u sec unless being released\n", + hdev->device_release_watchdog_timeout_sec); + schedule_delayed_work(&hdev->device_release_watchdog_work.reset_work, + msecs_to_jiffies(hdev->device_release_watchdog_timeout_sec * 1000)); + hdev->reset_info.watchdog_active = 1; +out: + spin_unlock(&hdev->reset_info.lock); + + hl_notifier_event_send_all(hdev, event_mask); + + hl_ctx_put(ctx); + + return 0; + +device_reset: + if (event_mask) + hl_notifier_event_send_all(hdev, event_mask); + if (ctx) + hl_ctx_put(ctx); + + return hl_device_reset(hdev, flags); +} + static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event_mask) { mutex_lock(¬ifier_event->lock); @@ -1728,6 +1903,11 @@ void hl_notifier_event_send_all(struct hl_device *hdev, u64 event_mask) { struct hl_fpriv *hpriv; + if (!event_mask) { + dev_warn(hdev->dev, "Skip sending zero event"); + return; + } + mutex_lock(&hdev->fpriv_list_lock); list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) @@ -1898,6 +2078,8 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) hdev->asic_funcs->state_dump_init(hdev); + hdev->device_release_watchdog_timeout_sec = HL_DEVICE_RELEASE_WATCHDOG_TIMEOUT_SEC; + hdev->memory_scrub_val = MEM_SCRUB_DEFAULT_VAL; hl_debugfs_add_device(hdev); @@ -2118,6 +2300,8 @@ void hl_device_fini(struct hl_device *hdev) } } + cancel_delayed_work_sync(&hdev->device_release_watchdog_work.reset_work); + /* Disable PCI access from device F/W so it won't send us additional * interrupts. We disable MSI/MSI-X at the halt_engines function and we * can't have the F/W sending us interrupts after that. We need to @@ -2144,14 +2328,16 @@ void hl_device_fini(struct hl_device *hdev) */ dev_info(hdev->dev, "Waiting for all processes to exit (timeout of %u seconds)", - HL_PENDING_RESET_LONG_SEC); + HL_WAIT_PROCESS_KILL_ON_DEVICE_FINI); - rc = device_kill_open_processes(hdev, HL_PENDING_RESET_LONG_SEC, false); + hdev->process_kill_trial_cnt = 0; + rc = device_kill_open_processes(hdev, HL_WAIT_PROCESS_KILL_ON_DEVICE_FINI, false); if (rc) { dev_crit(hdev->dev, "Failed to kill all open processes\n"); device_disable_open_processes(hdev, false); } + hdev->process_kill_trial_cnt = 0; rc = device_kill_open_processes(hdev, 0, true); if (rc) { dev_crit(hdev->dev, "Failed to kill all control device open processes\n"); @@ -2177,6 +2363,8 @@ void hl_device_fini(struct hl_device *hdev) hl_mmu_fini(hdev); + vfree(hdev->captured_err_info.pgf_info.user_mappings); + hl_eq_fini(hdev, &hdev->event_queue); kfree(hdev->shadow_cs_queue); @@ -2231,3 +2419,117 @@ inline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val) { writel(val, hdev->rmmio + reg); } + +void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines, + u8 flags) +{ + if (num_of_engines > HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR) { + dev_err(hdev->dev, + "Number of possible razwi initiators (%u) exceeded limit (%u)\n", + num_of_engines, HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR); + return; + } + + /* In case it's the first razwi since the device was opened, capture its parameters */ + if (atomic_cmpxchg(&hdev->captured_err_info.razwi_info_recorded, 0, 1)) + return; + + hdev->captured_err_info.razwi.timestamp = ktime_to_ns(ktime_get()); + hdev->captured_err_info.razwi.addr = addr; + hdev->captured_err_info.razwi.num_of_possible_engines = num_of_engines; + memcpy(&hdev->captured_err_info.razwi.engine_id[0], &engine_id[0], + num_of_engines * sizeof(u16)); + hdev->captured_err_info.razwi.flags = flags; +} + +void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines, + u8 flags, u64 *event_mask) +{ + hl_capture_razwi(hdev, addr, engine_id, num_of_engines, flags); + + if (event_mask) + *event_mask |= HL_NOTIFIER_EVENT_RAZWI; +} + +static void hl_capture_user_mappings(struct hl_device *hdev, bool is_pmmu) +{ + struct page_fault_info *pgf_info = &hdev->captured_err_info.pgf_info; + struct hl_vm_phys_pg_pack *phys_pg_pack = NULL; + struct hl_vm_hash_node *hnode; + struct hl_userptr *userptr; + enum vm_type *vm_type; + struct hl_ctx *ctx; + u32 map_idx = 0; + int i; + + /* Reset previous session count*/ + pgf_info->num_of_user_mappings = 0; + + ctx = hl_get_compute_ctx(hdev); + if (!ctx) { + dev_err(hdev->dev, "Can't get user context for user mappings\n"); + return; + } + + mutex_lock(&ctx->mem_hash_lock); + hash_for_each(ctx->mem_hash, i, hnode, node) { + vm_type = hnode->ptr; + if (((*vm_type == VM_TYPE_USERPTR) && is_pmmu) || + ((*vm_type == VM_TYPE_PHYS_PACK) && !is_pmmu)) + pgf_info->num_of_user_mappings++; + + } + + if (!pgf_info->num_of_user_mappings) + goto finish; + + /* In case we already allocated in previous session, need to release it before + * allocating new buffer. + */ + vfree(pgf_info->user_mappings); + pgf_info->user_mappings = + vzalloc(pgf_info->num_of_user_mappings * sizeof(struct hl_user_mapping)); + if (!pgf_info->user_mappings) { + pgf_info->num_of_user_mappings = 0; + goto finish; + } + + hash_for_each(ctx->mem_hash, i, hnode, node) { + vm_type = hnode->ptr; + if ((*vm_type == VM_TYPE_USERPTR) && (is_pmmu)) { + userptr = hnode->ptr; + pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr; + pgf_info->user_mappings[map_idx].size = userptr->size; + map_idx++; + } else if ((*vm_type == VM_TYPE_PHYS_PACK) && (!is_pmmu)) { + phys_pg_pack = hnode->ptr; + pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr; + pgf_info->user_mappings[map_idx].size = phys_pg_pack->total_size; + map_idx++; + } + } +finish: + mutex_unlock(&ctx->mem_hash_lock); + hl_ctx_put(ctx); +} + +void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu) +{ + /* Capture only the first page fault */ + if (atomic_cmpxchg(&hdev->captured_err_info.pgf_info_recorded, 0, 1)) + return; + + hdev->captured_err_info.pgf_info.pgf.timestamp = ktime_to_ns(ktime_get()); + hdev->captured_err_info.pgf_info.pgf.addr = addr; + hdev->captured_err_info.pgf_info.pgf.engine_id = eng_id; + hl_capture_user_mappings(hdev, is_pmmu); +} + +void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu, + u64 *event_mask) +{ + hl_capture_page_fault(hdev, addr, eng_id, is_pmmu); + + if (event_mask) + *event_mask |= HL_NOTIFIER_EVENT_PAGE_FAULT; +} diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index 2de6a9bd564d..228b92278e48 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -12,6 +12,7 @@ #include <linux/crc32.h> #include <linux/slab.h> #include <linux/ctype.h> +#include <linux/vmalloc.h> #define FW_FILE_MAX_SIZE 0x1400000 /* maximum size of 20MB */ @@ -323,6 +324,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, if (!prop->supports_advanced_cpucp_rc) { dev_dbg(hdev->dev, "F/W ERROR %d for CPU packet %d\n", rc, opcode); + rc = -EIO; goto scrub_descriptor; } @@ -615,16 +617,12 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, if (sts_val & CPU_BOOT_DEV_STS0_ENABLED) dev_dbg(hdev->dev, "Device status0 %#x\n", sts_val); - /* All warnings should go here in order not to reach the unknown error validation */ if (err_val & CPU_BOOT_ERR0_EEPROM_FAIL) { - dev_warn(hdev->dev, - "Device boot warning - EEPROM failure detected, default settings applied\n"); - /* This is a warning so we don't want it to disable the - * device - */ - err_val &= ~CPU_BOOT_ERR0_EEPROM_FAIL; + dev_err(hdev->dev, "Device boot error - EEPROM failure detected\n"); + err_exists = true; } + /* All warnings should go here in order not to reach the unknown error validation */ if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) { dev_warn(hdev->dev, "Device boot warning - Skipped DRAM initialization\n"); @@ -1782,6 +1780,8 @@ int hl_fw_dynamic_send_protocol_cmd(struct hl_device *hdev, /* first send clear command to clean former commands */ rc = hl_fw_dynamic_send_clear_cmd(hdev, fw_loader); + if (rc) + return rc; /* send the actual command */ hl_fw_dynamic_send_cmd(hdev, fw_loader, cmd, size); @@ -1988,10 +1988,11 @@ static int hl_fw_dynamic_read_and_validate_descriptor(struct hl_device *hdev, struct fw_load_mgr *fw_loader) { struct lkd_fw_comms_desc *fw_desc; + void __iomem *src, *temp_fw_desc; struct pci_mem_region *region; struct fw_response *response; + u16 fw_data_size; enum pci_region region_id; - void __iomem *src; int rc; fw_desc = &fw_loader->dynamic_loader.comm_desc; @@ -2018,9 +2019,29 @@ static int hl_fw_dynamic_read_and_validate_descriptor(struct hl_device *hdev, fw_loader->dynamic_loader.fw_desc_valid = false; src = hdev->pcie_bar[region->bar_id] + region->offset_in_bar + response->ram_offset; + + /* + * We do the copy of the fw descriptor in 2 phases: + * 1. copy the header + data info according to our lkd_fw_comms_desc definition. + * then we're able to read the actual data size provided by fw. + * this is needed for cases where data in descriptor was changed(add/remove) + * in embedded specs header file before updating lkd copy of the header file + * 2. copy descriptor to temporary buffer with aligned size and send it to validation + */ memcpy_fromio(fw_desc, src, sizeof(struct lkd_fw_comms_desc)); + fw_data_size = le16_to_cpu(fw_desc->header.size); + + temp_fw_desc = vzalloc(sizeof(struct comms_desc_header) + fw_data_size); + if (!temp_fw_desc) + return -ENOMEM; - return hl_fw_dynamic_validate_descriptor(hdev, fw_loader, fw_desc); + memcpy_fromio(temp_fw_desc, src, sizeof(struct comms_desc_header) + fw_data_size); + + rc = hl_fw_dynamic_validate_descriptor(hdev, fw_loader, + (struct lkd_fw_comms_desc *) temp_fw_desc); + vfree(temp_fw_desc); + + return rc; } /** @@ -2507,7 +2528,7 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, struct fw_load_mgr *fw_loader) { struct cpu_dyn_regs *dyn_regs; - int rc; + int rc, fw_error_rc; dev_info(hdev->dev, "Loading %sfirmware to device, may take some time...\n", @@ -2607,14 +2628,17 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, hl_fw_dynamic_update_linux_interrupt_if(hdev); - return 0; - protocol_err: - if (fw_loader->dynamic_loader.fw_desc_valid) - fw_read_errors(hdev, le32_to_cpu(dyn_regs->cpu_boot_err0), + if (fw_loader->dynamic_loader.fw_desc_valid) { + fw_error_rc = fw_read_errors(hdev, le32_to_cpu(dyn_regs->cpu_boot_err0), le32_to_cpu(dyn_regs->cpu_boot_err1), le32_to_cpu(dyn_regs->cpu_boot_dev_sts0), le32_to_cpu(dyn_regs->cpu_boot_dev_sts1)); + + if (fw_error_rc) + return fw_error_rc; + } + return rc; } @@ -2983,7 +3007,7 @@ static int hl_fw_get_sec_attest_data(struct hl_device *hdev, u32 packet_id, void int rc; req_cpu_addr = hl_cpu_accessible_dma_pool_alloc(hdev, size, &req_dma_addr); - if (!data) { + if (!req_cpu_addr) { dev_err(hdev->dev, "Failed to allocate DMA memory for CPU-CP packet %u\n", packet_id); return -ENOMEM; diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 58c95b13be69..e2527d976ee0 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -50,9 +50,14 @@ struct hl_fpriv; #define HL_MMAP_OFFSET_VALUE_MASK (0x1FFFFFFFFFFFull >> PAGE_SHIFT) #define HL_MMAP_OFFSET_VALUE_GET(off) (off & HL_MMAP_OFFSET_VALUE_MASK) -#define HL_PENDING_RESET_PER_SEC 10 -#define HL_PENDING_RESET_MAX_TRIALS 60 /* 10 minutes */ -#define HL_PENDING_RESET_LONG_SEC 60 +#define HL_PENDING_RESET_PER_SEC 10 +#define HL_PENDING_RESET_MAX_TRIALS 60 /* 10 minutes */ +#define HL_PENDING_RESET_LONG_SEC 60 +/* + * In device fini, wait 10 minutes for user processes to be terminated after we kill them. + * This is needed to prevent situation of clearing resources while user processes are still alive. + */ +#define HL_WAIT_PROCESS_KILL_ON_DEVICE_FINI 600 #define HL_HARD_RESET_MAX_TIMEOUT 120 #define HL_PLDM_HARD_RESET_MAX_TIMEOUT (HL_HARD_RESET_MAX_TIMEOUT * 3) @@ -191,6 +196,9 @@ enum hl_mmu_enablement { * * - HL_DRV_RESET_DELAY * Set if a delay should be added before the reset + * + * - HL_DRV_RESET_FROM_WD_THR + * Set if the caller is the device release watchdog thread */ #define HL_DRV_RESET_HARD (1 << 0) @@ -201,6 +209,7 @@ enum hl_mmu_enablement { #define HL_DRV_RESET_BYPASS_REQ_TO_FW (1 << 5) #define HL_DRV_RESET_FW_FATAL_ERR (1 << 6) #define HL_DRV_RESET_DELAY (1 << 7) +#define HL_DRV_RESET_FROM_WD_THR (1 << 8) /* * Security @@ -1188,7 +1197,7 @@ struct hl_dec { * @ASIC_GAUDI: Gaudi device (HL-2000). * @ASIC_GAUDI_SEC: Gaudi secured device (HL-2000). * @ASIC_GAUDI2: Gaudi2 device. - * @ASIC_GAUDI2_SEC: Gaudi2 secured device. + * @ASIC_GAUDI2B: Gaudi2B device. */ enum hl_asic_type { ASIC_INVALID, @@ -1196,7 +1205,7 @@ enum hl_asic_type { ASIC_GAUDI, ASIC_GAUDI_SEC, ASIC_GAUDI2, - ASIC_GAUDI2_SEC, + ASIC_GAUDI2B, }; struct hl_cs_parser; @@ -2489,13 +2498,9 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); #define WREG32_AND(reg, and) WREG32_P(reg, 0, and) #define WREG32_OR(reg, or) WREG32_P(reg, or, ~(or)) -#define RMWREG32(reg, val, mask) \ - do { \ - u32 tmp_ = RREG32(reg); \ - tmp_ &= ~(mask); \ - tmp_ |= ((val) << __ffs(mask)); \ - WREG32(reg, tmp_); \ - } while (0) +#define RMWREG32_SHIFTED(reg, val, mask) WREG32_P(reg, val, ~(mask)) + +#define RMWREG32(reg, val, mask) RMWREG32_SHIFTED(reg, (val) << __ffs(mask), mask) #define RREG32_MASK(reg, mask) ((RREG32(reg) & mask) >> __ffs(mask)) @@ -2528,7 +2533,7 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); break; \ (val) = __elbi_read; \ } else {\ - (val) = RREG32((u32)(addr)); \ + (val) = RREG32(lower_32_bits(addr)); \ } \ if (cond) \ break; \ @@ -2539,7 +2544,7 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); break; \ (val) = __elbi_read; \ } else {\ - (val) = RREG32((u32)(addr)); \ + (val) = RREG32(lower_32_bits(addr)); \ } \ break; \ } \ @@ -2594,7 +2599,7 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); if (__rc) \ break; \ } else { \ - __read_val = RREG32((u32)(addr_arr)[__arr_idx]); \ + __read_val = RREG32(lower_32_bits(addr_arr[__arr_idx])); \ } \ if (__read_val == (expected_val)) \ __elem_bitmask &= ~BIT_ULL(__arr_idx); \ @@ -2682,17 +2687,15 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); struct hwmon_chip_info; /** - * struct hl_device_reset_work - reset workqueue task wrapper. - * @wq: work queue for device reset procedure. + * struct hl_device_reset_work - reset work wrapper. * @reset_work: reset work to be done. * @hdev: habanalabs device structure. * @flags: reset flags. */ struct hl_device_reset_work { - struct workqueue_struct *wq; - struct delayed_work reset_work; - struct hl_device *hdev; - u32 flags; + struct delayed_work reset_work; + struct hl_device *hdev; + u32 flags; }; /** @@ -2811,7 +2814,7 @@ struct hl_mmu_funcs { /** * struct hl_prefetch_work - prefetch work structure handler - * @pf_work: actual work struct. + * @prefetch_work: actual work struct. * @ctx: compute context. * @va: virtual address to pre-fetch. * @size: pre-fetch size. @@ -2819,7 +2822,7 @@ struct hl_mmu_funcs { * @asid: ASID for maintenance operation. */ struct hl_prefetch_work { - struct work_struct pf_work; + struct work_struct prefetch_work; struct hl_ctx *ctx; u64 va; u64 size; @@ -2925,30 +2928,6 @@ struct cs_timeout_info { u64 seq; }; -/** - * struct razwi_info - info about last razwi error occurred. - * @timestamp: razwi timestamp. - * @write_enable: if set writing to razwi parameters in the structure is enabled. - * otherwise - disabled, so the first (root cause) razwi will not be overwritten. - * @addr: address that caused razwi. - * @engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does - * not have engine id it will be set to U16_MAX. - * @engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible - * engines which one them caused the razwi. In that case, it will contain the - * second possible engine id, otherwise it will be set to U16_MAX. - * @non_engine_initiator: in case the initiator of the razwi does not have engine id. - * @type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX. - */ -struct razwi_info { - ktime_t timestamp; - atomic_t write_enable; - u64 addr; - u16 engine_id_1; - u16 engine_id_2; - u8 non_engine_initiator; - u8 type; -}; - #define MAX_QMAN_STREAMS_INFO 4 #define OPCODE_INFO_MAX_ADDR_SIZE 8 /** @@ -2982,15 +2961,37 @@ struct undefined_opcode_info { }; /** + * struct page_fault_info - info about page fault + * @pgf_info: page fault information. + * @user_mappings: buffer containing user mappings. + * @num_of_user_mappings: number of user mappings. + */ +struct page_fault_info { + struct hl_page_fault_info pgf; + struct hl_user_mapping *user_mappings; + u64 num_of_user_mappings; +}; + +/** * struct hl_error_info - holds information collected during an error. * @cs_timeout: CS timeout error information. * @razwi: razwi information. + * @razwi_info_recorded: if set writing to razwi information is enabled. + * otherwise - disabled, so the first (root cause) razwi will not be + * overwritten. * @undef_opcode: undefined opcode information + * @pgf_info: page fault information. + * @pgf_info_recorded: if set writing to page fault information is enabled. + * otherwise - disabled, so the first (root cause) page fault will not be + * overwritten. */ struct hl_error_info { struct cs_timeout_info cs_timeout; - struct razwi_info razwi; + struct hl_info_razwi_event razwi; + atomic_t razwi_info_recorded; struct undefined_opcode_info undef_opcode; + struct page_fault_info pgf_info; + atomic_t pgf_info_recorded; }; /** @@ -3013,6 +3014,7 @@ struct hl_error_info { * same cause. * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to * complete instead. + * @watchdog_active: true if a device release watchdog work is scheduled. */ struct hl_reset_info { spinlock_t lock; @@ -3023,12 +3025,11 @@ struct hl_reset_info { u8 in_compute_reset; u8 needs_reset; u8 hard_reset_pending; - u8 curr_reset_cause; u8 prev_reset_trigger; u8 reset_trigger_repeated; - u8 skip_reset_on_timeout; + u8 watchdog_active; }; /** @@ -3044,6 +3045,8 @@ struct hl_reset_info { * @dev_ctrl: related kernel device structure for the control device * @work_heartbeat: delayed work for CPU-CP is-alive check. * @device_reset_work: delayed work which performs hard reset + * @device_release_watchdog_work: watchdog work that performs hard reset if user doesn't release + * device upon certain error cases. * @asic_name: ASIC specific name. * @asic_type: ASIC specific type. * @completion_queue: array of hl_cq. @@ -3062,7 +3065,8 @@ struct hl_reset_info { * @cs_cmplt_wq: work queue of CS completions for executing work in process * context. * @ts_free_obj_wq: work queue for timestamp registration objects release. - * @pf_wq: work queue for MMU pre-fetch operations. + * @prefetch_wq: work queue for MMU pre-fetch operations. + * @reset_wq: work queue for device reset procedure. * @kernel_ctx: Kernel driver context structure. * @kernel_queues: array of hl_hw_queue. * @cs_mirror_list: CS mirror list for TDR. @@ -3152,6 +3156,7 @@ struct hl_reset_info { * indicates which decoder engines are binned-out * @edma_binning: contains mask of edma engines that is received from the f/w which * indicates which edma engines are binned-out + * @device_release_watchdog_timeout_sec: device release watchdog timeout value in seconds. * @id: device minor. * @id_control: minor of the control device. * @cdev_idx: char device index. Used for setting its name. @@ -3221,6 +3226,7 @@ struct hl_device { struct device *dev_ctrl; struct delayed_work work_heartbeat; struct hl_device_reset_work device_reset_work; + struct hl_device_reset_work device_release_watchdog_work; char asic_name[HL_STR_MAX]; char status[HL_DEV_STS_MAX][HL_STR_MAX]; enum hl_asic_type asic_type; @@ -3233,7 +3239,8 @@ struct hl_device { struct workqueue_struct *eq_wq; struct workqueue_struct *cs_cmplt_wq; struct workqueue_struct *ts_free_obj_wq; - struct workqueue_struct *pf_wq; + struct workqueue_struct *prefetch_wq; + struct workqueue_struct *reset_wq; struct hl_ctx *kernel_ctx; struct hl_hw_queue *kernel_queues; struct list_head cs_mirror_list; @@ -3314,6 +3321,7 @@ struct hl_device { u32 high_pll; u32 decoder_binning; u32 edma_binning; + u32 device_release_watchdog_timeout_sec; u16 id; u16 id_control; u16 cdev_idx; @@ -3488,6 +3496,8 @@ void hl_asic_dma_pool_free_caller(struct hl_device *hdev, void *vaddr, dma_addr_ int hl_dma_map_sgtable(struct hl_device *hdev, struct sg_table *sgt, enum dma_data_direction dir); void hl_dma_unmap_sgtable(struct hl_device *hdev, struct sg_table *sgt, enum dma_data_direction dir); +int hl_access_sram_dram_region(struct hl_device *hdev, u64 addr, u64 *val, + enum debugfs_access_type acc_type, enum pci_region region_type, bool set_dram_bar); int hl_access_cfg_region(struct hl_device *hdev, u64 addr, u64 *val, enum debugfs_access_type acc_type); int hl_access_dev_mem(struct hl_device *hdev, enum pci_region region_type, @@ -3496,6 +3506,8 @@ int hl_device_open(struct inode *inode, struct file *filp); int hl_device_open_ctrl(struct inode *inode, struct file *filp); bool hl_device_operational(struct hl_device *hdev, enum hl_device_status *status); +bool hl_ctrl_device_operational(struct hl_device *hdev, + enum hl_device_status *status); enum hl_device_status hl_device_status(struct hl_device *hdev); int hl_device_set_debug_mode(struct hl_device *hdev, struct hl_ctx *ctx, bool enable); int hl_hw_queues_create(struct hl_device *hdev); @@ -3549,6 +3561,7 @@ void hl_device_fini(struct hl_device *hdev); int hl_device_suspend(struct hl_device *hdev); int hl_device_resume(struct hl_device *hdev); int hl_device_reset(struct hl_device *hdev, u32 flags); +int hl_device_cond_reset(struct hl_device *hdev, u32 flags, u64 event_mask); void hl_hpriv_get(struct hl_fpriv *hpriv); int hl_hpriv_put(struct hl_fpriv *hpriv); int hl_device_utilization(struct hl_device *hdev, u32 *utilization); @@ -3762,7 +3775,8 @@ void hl_sysfs_add_dev_vrm_attr(struct hl_device *hdev, struct attribute_group *d void hw_sob_get(struct hl_hw_sob *hw_sob); void hw_sob_put(struct hl_hw_sob *hw_sob); -void hl_encaps_handle_do_release(struct kref *ref); +void hl_encaps_release_handle_and_put_ctx(struct kref *ref); +void hl_encaps_release_handle_and_put_sob_ctx(struct kref *ref); void hl_hw_queue_encaps_sig_set_sob_info(struct hl_device *hdev, struct hl_cs *cs, struct hl_cs_job *job, struct hl_cs_compl *cs_cmpl); @@ -3798,6 +3812,13 @@ hl_mmap_mem_buf_alloc(struct hl_mem_mgr *mmg, struct hl_mmap_mem_buf_behavior *behavior, gfp_t gfp, void *args); __printf(2, 3) void hl_engine_data_sprintf(struct engines_data *e, const char *fmt, ...); +void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines, + u8 flags); +void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines, + u8 flags, u64 *event_mask); +void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu); +void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu, + u64 *event_mask); #ifdef CONFIG_DEBUG_FS diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index 112632afe7d5..7815c60df54e 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -9,6 +9,7 @@ #define pr_fmt(fmt) "habanalabs: " fmt #include "habanalabs.h" +#include "../include/hw_ip/pci/pci_general.h" #include <linux/pci.h> #include <linux/aer.h> @@ -74,16 +75,17 @@ MODULE_DEVICE_TABLE(pci, ids); /* * get_asic_type - translate device id to asic type * - * @device: id of the PCI device + * @hdev: pointer to habanalabs device structure. * - * Translate device id to asic type. + * Translate device id and revision id to asic type. * In case of unidentified device, return -1 */ -static enum hl_asic_type get_asic_type(u16 device) +static enum hl_asic_type get_asic_type(struct hl_device *hdev) { - enum hl_asic_type asic_type; + struct pci_dev *pdev = hdev->pdev; + enum hl_asic_type asic_type = ASIC_INVALID; - switch (device) { + switch (pdev->device) { case PCI_IDS_GOYA: asic_type = ASIC_GOYA; break; @@ -94,10 +96,18 @@ static enum hl_asic_type get_asic_type(u16 device) asic_type = ASIC_GAUDI_SEC; break; case PCI_IDS_GAUDI2: - asic_type = ASIC_GAUDI2; + switch (pdev->revision) { + case REV_ID_A: + asic_type = ASIC_GAUDI2; + break; + case REV_ID_B: + asic_type = ASIC_GAUDI2B; + break; + default: + break; + } break; default: - asic_type = ASIC_INVALID; break; } @@ -212,7 +222,8 @@ int hl_device_open(struct inode *inode, struct file *filp) hl_debugfs_add_file(hpriv); atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1); - atomic_set(&hdev->captured_err_info.razwi.write_enable, 1); + atomic_set(&hdev->captured_err_info.razwi_info_recorded, 0); + atomic_set(&hdev->captured_err_info.pgf_info_recorded, 0); hdev->captured_err_info.undef_opcode.write_enable = true; hdev->open_counter++; @@ -270,9 +281,9 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp) mutex_lock(&hdev->fpriv_ctrl_list_lock); - if (!hl_device_operational(hdev, NULL)) { + if (!hl_ctrl_device_operational(hdev, NULL)) { dev_dbg_ratelimited(hdev->dev_ctrl, - "Can't open %s because it is disabled or in reset\n", + "Can't open %s because it is disabled\n", dev_name(hdev->dev_ctrl)); rc = -EPERM; goto out_err; @@ -415,7 +426,7 @@ static int create_hdev(struct hl_device **dev, struct pci_dev *pdev) /* First, we must find out which ASIC are we handling. This is needed * to configure the behavior of the driver (kernel parameters) */ - hdev->asic_type = get_asic_type(pdev->device); + hdev->asic_type = get_asic_type(hdev); if (hdev->asic_type == ASIC_INVALID) { dev_err(&pdev->dev, "Unsupported ASIC\n"); rc = -ENODEV; @@ -594,15 +605,16 @@ hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state) switch (state) { case pci_channel_io_normal: + dev_warn(hdev->dev, "PCI normal state error detected\n"); return PCI_ERS_RESULT_CAN_RECOVER; case pci_channel_io_frozen: - dev_warn(hdev->dev, "frozen state error detected\n"); + dev_warn(hdev->dev, "PCI frozen state error detected\n"); result = PCI_ERS_RESULT_NEED_RESET; break; case pci_channel_io_perm_failure: - dev_warn(hdev->dev, "failure state error detected\n"); + dev_warn(hdev->dev, "PCI failure state error detected\n"); result = PCI_ERS_RESULT_DISCONNECT; break; @@ -638,6 +650,10 @@ static void hl_pci_err_resume(struct pci_dev *pdev) */ static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev) { + struct hl_device *hdev = pci_get_drvdata(pdev); + + dev_warn(hdev->dev, "PCI slot reset detected\n"); + return PCI_ERS_RESULT_RECOVERED; } diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index 43afe40966e5..b6abfa7761a7 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -10,10 +10,11 @@ #include <uapi/misc/habanalabs.h> #include "habanalabs.h" -#include <linux/kernel.h> #include <linux/fs.h> -#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/pci.h> #include <linux/slab.h> +#include <linux/uaccess.h> #include <linux/vmalloc.h> static u32 hl_debug_struct_size[HL_DEBUG_OP_TIMESTAMP + 1] = { @@ -105,6 +106,7 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args) hw_ip.edma_enabled_mask = prop->edma_enabled_mask; hw_ip.server_type = prop->server_type; hw_ip.security_enabled = prop->fw_security_enabled; + hw_ip.revision_id = hdev->pdev->revision; return copy_to_user(out, &hw_ip, min((size_t) size, sizeof(hw_ip))) ? -EFAULT : 0; @@ -121,6 +123,10 @@ static int hw_events_info(struct hl_device *hdev, bool aggregate, return -EINVAL; arr = hdev->asic_funcs->get_events_stat(hdev, aggregate, &size); + if (!arr) { + dev_err(hdev->dev, "Events info not supported\n"); + return -EOPNOTSUPP; + } return copy_to_user(out, arr, min(max_size, size)) ? -EFAULT : 0; } @@ -603,20 +609,14 @@ static int razwi_info(struct hl_fpriv *hpriv, struct hl_info_args *args) { struct hl_device *hdev = hpriv->hdev; u32 max_size = args->return_size; - struct hl_info_razwi_event info = {0}; + struct hl_info_razwi_event *info = &hdev->captured_err_info.razwi; void __user *out = (void __user *) (uintptr_t) args->return_pointer; if ((!max_size) || (!out)) return -EINVAL; - info.timestamp = ktime_to_ns(hdev->captured_err_info.razwi.timestamp); - info.addr = hdev->captured_err_info.razwi.addr; - info.engine_id_1 = hdev->captured_err_info.razwi.engine_id_1; - info.engine_id_2 = hdev->captured_err_info.razwi.engine_id_2; - info.no_engine_id = hdev->captured_err_info.razwi.non_engine_initiator; - info.error_type = hdev->captured_err_info.razwi.type; - - return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0; + return copy_to_user(out, info, min_t(size_t, max_size, sizeof(struct hl_info_razwi_event))) + ? -EFAULT : 0; } static int undefined_opcode_info(struct hl_fpriv *hpriv, struct hl_info_args *args) @@ -784,6 +784,42 @@ static int engine_status_info(struct hl_fpriv *hpriv, struct hl_info_args *args) return rc; } +static int page_fault_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + u32 max_size = args->return_size; + struct hl_page_fault_info *info = &hdev->captured_err_info.pgf_info.pgf; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!max_size) || (!out)) + return -EINVAL; + + return copy_to_user(out, info, min_t(size_t, max_size, sizeof(struct hl_page_fault_info))) + ? -EFAULT : 0; +} + +static int user_mappings_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + u32 user_buf_size = args->return_size; + struct hl_device *hdev = hpriv->hdev; + struct page_fault_info *pgf_info; + u64 actual_size; + + pgf_info = &hdev->captured_err_info.pgf_info; + args->array_size = pgf_info->num_of_user_mappings; + + if (!out) + return -EINVAL; + + actual_size = pgf_info->num_of_user_mappings * sizeof(struct hl_user_mapping); + if (user_buf_size < actual_size) + return -ENOMEM; + + return copy_to_user(out, pgf_info->user_mappings, min_t(size_t, user_buf_size, actual_size)) + ? -EFAULT : 0; +} + static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, struct device *dev) { @@ -843,6 +879,15 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_GET_EVENTS: return events_info(hpriv, args); + case HL_INFO_PAGE_FAULT_EVENT: + return page_fault_info(hpriv, args); + + case HL_INFO_USER_MAPPINGS: + return user_mappings_info(hpriv, args); + + case HL_INFO_UNREGISTER_EVENTFD: + return eventfd_unregister(hpriv, args); + default: break; } @@ -899,9 +944,6 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_REGISTER_EVENTFD: return eventfd_register(hpriv, args); - case HL_INFO_UNREGISTER_EVENTFD: - return eventfd_unregister(hpriv, args); - case HL_INFO_ENGINE_STATUS: return engine_status_info(hpriv, args); diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c index e35cca96bbef..5e9ae7600d75 100644 --- a/drivers/misc/habanalabs/common/memory.c +++ b/drivers/misc/habanalabs/common/memory.c @@ -1689,7 +1689,7 @@ static int hl_dmabuf_attach(struct dma_buf *dmabuf, hl_dmabuf = dmabuf->priv; hdev = hl_dmabuf->ctx->hdev; - rc = pci_p2pdma_distance_many(hdev->pdev, &attachment->dev, 1, true); + rc = pci_p2pdma_distance(hdev->pdev, attachment->dev, true); if (rc < 0) attachment->peer2peer = false; @@ -2109,7 +2109,7 @@ static int hl_ts_alloc_buf(struct hl_mmap_mem_buf *buf, gfp_t gfp, void *args) /* Allocate the internal kernel buffer */ size = num_elements * sizeof(struct hl_user_pending_interrupt); - p = vmalloc(size); + p = vzalloc(size); if (!p) goto free_user_buff; @@ -2507,24 +2507,20 @@ static int va_range_init(struct hl_device *hdev, struct hl_va_range **va_ranges, /* * PAGE_SIZE alignment - * it is the callers responsibility to align the addresses if the + * it is the caller's responsibility to align the addresses if the * page size is not a power of 2 */ if (is_power_of_2(page_size)) { - if (start & (PAGE_SIZE - 1)) { - start &= PAGE_MASK; - start += PAGE_SIZE; - } + start = round_up(start, page_size); /* * The end of the range is inclusive, hence we need to align it * to the end of the last full page in the range. For example if * end = 0x3ff5 with page size 0x1000, we need to align it to - * 0x2fff. The remainig 0xff5 bytes do not form a full page. + * 0x2fff. The remaining 0xff5 bytes do not form a full page. */ - if ((end + 1) & (PAGE_SIZE - 1)) - end = ((end + 1) & PAGE_MASK) - 1; + end = round_down(end + 1, page_size) - 1; } if (start >= end) { diff --git a/drivers/misc/habanalabs/common/mmu/mmu.c b/drivers/misc/habanalabs/common/mmu/mmu.c index cf8946266615..2c1005f74cf4 100644 --- a/drivers/misc/habanalabs/common/mmu/mmu.c +++ b/drivers/misc/habanalabs/common/mmu/mmu.c @@ -635,7 +635,7 @@ int hl_mmu_if_set_funcs(struct hl_device *hdev) hl_mmu_v1_set_funcs(hdev, &hdev->mmu_func[MMU_DR_PGT]); break; case ASIC_GAUDI2: - case ASIC_GAUDI2_SEC: + case ASIC_GAUDI2B: /* MMUs in Gaudi2 are always host resident */ hl_mmu_v2_hr_set_funcs(hdev, &hdev->mmu_func[MMU_HR_PGT]); break; @@ -699,7 +699,7 @@ int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard, static void hl_mmu_prefetch_work_function(struct work_struct *work) { - struct hl_prefetch_work *pfw = container_of(work, struct hl_prefetch_work, pf_work); + struct hl_prefetch_work *pfw = container_of(work, struct hl_prefetch_work, prefetch_work); struct hl_ctx *ctx = pfw->ctx; struct hl_device *hdev = ctx->hdev; @@ -723,25 +723,25 @@ put_ctx: int hl_mmu_prefetch_cache_range(struct hl_ctx *ctx, u32 flags, u32 asid, u64 va, u64 size) { - struct hl_prefetch_work *handle_pf_work; + struct hl_prefetch_work *handle_prefetch_work; - handle_pf_work = kmalloc(sizeof(*handle_pf_work), GFP_KERNEL); - if (!handle_pf_work) + handle_prefetch_work = kmalloc(sizeof(*handle_prefetch_work), GFP_KERNEL); + if (!handle_prefetch_work) return -ENOMEM; - INIT_WORK(&handle_pf_work->pf_work, hl_mmu_prefetch_work_function); - handle_pf_work->ctx = ctx; - handle_pf_work->va = va; - handle_pf_work->size = size; - handle_pf_work->flags = flags; - handle_pf_work->asid = asid; + INIT_WORK(&handle_prefetch_work->prefetch_work, hl_mmu_prefetch_work_function); + handle_prefetch_work->ctx = ctx; + handle_prefetch_work->va = va; + handle_prefetch_work->size = size; + handle_prefetch_work->flags = flags; + handle_prefetch_work->asid = asid; /* * as actual prefetch is done in a WQ we must get the context (and put it * at the end of the work function) */ hl_ctx_get(ctx); - queue_work(ctx->hdev->pf_wq, &handle_pf_work->pf_work); + queue_work(ctx->hdev->prefetch_wq, &handle_prefetch_work->prefetch_work); return 0; } diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c index 36e9814139d1..735d8bed0066 100644 --- a/drivers/misc/habanalabs/common/sysfs.c +++ b/drivers/misc/habanalabs/common/sysfs.c @@ -248,8 +248,8 @@ static ssize_t device_type_show(struct device *dev, case ASIC_GAUDI2: str = "GAUDI2"; break; - case ASIC_GAUDI2_SEC: - str = "GAUDI2 SEC"; + case ASIC_GAUDI2B: + str = "GAUDI2B"; break; default: dev_err(hdev->dev, "Unrecognized ASIC type %d\n", diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 92560414e843..9f5e208701ba 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -6505,8 +6505,8 @@ event_not_supported: } static const char *gaudi_get_razwi_initiator_dma_name(struct hl_device *hdev, u32 x_y, - bool is_write, s32 *engine_id_1, - s32 *engine_id_2) + bool is_write, u16 *engine_id_1, + u16 *engine_id_2) { u32 dma_id[2], dma_offset, err_cause[2], mask, i; @@ -6603,7 +6603,7 @@ unknown_initiator: } static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, bool is_write, - u32 *engine_id_1, u32 *engine_id_2) + u16 *engine_id_1, u16 *engine_id_2) { u32 val, x_y, axi_id; @@ -6719,8 +6719,8 @@ static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, bool i return "unknown initiator"; } -static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u32 *engine_id_1, - u32 *engine_id_2) +static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u16 *engine_id_1, + u16 *engine_id_2, bool *is_read, bool *is_write) { if (RREG32(mmMMU_UP_RAZWI_WRITE_VLD)) { @@ -6728,6 +6728,7 @@ static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u32 *engine_i "RAZWI event caused by illegal write of %s\n", gaudi_get_razwi_initiator_name(hdev, true, engine_id_1, engine_id_2)); WREG32(mmMMU_UP_RAZWI_WRITE_VLD, 0); + *is_write = true; } if (RREG32(mmMMU_UP_RAZWI_READ_VLD)) { @@ -6735,10 +6736,11 @@ static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u32 *engine_i "RAZWI event caused by illegal read of %s\n", gaudi_get_razwi_initiator_name(hdev, false, engine_id_1, engine_id_2)); WREG32(mmMMU_UP_RAZWI_READ_VLD, 0); + *is_read = true; } } -static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr, u8 *type) +static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr, u64 *event_mask) { struct gaudi_device *gaudi = hdev->asic_specific; u32 val; @@ -6753,7 +6755,7 @@ static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr *addr |= RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE_VA); dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n", *addr); - *type = HL_RAZWI_PAGE_FAULT; + hl_handle_page_fault(hdev, *addr, 0, true, event_mask); WREG32(mmMMU_UP_PAGE_ERROR_CAPTURE, 0); } @@ -6765,7 +6767,6 @@ static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr *addr |= RREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE_VA); dev_err_ratelimited(hdev->dev, "MMU access error on va 0x%llx\n", *addr); - *type = HL_RAZWI_MMU_ACCESS_ERROR; WREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE, 0); } @@ -7300,48 +7301,44 @@ static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e } static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type, - bool razwi) + bool razwi, u64 *event_mask) { - u32 engine_id_1, engine_id_2; + bool is_read = false, is_write = false; + u16 engine_id[2], num_of_razwi_eng = 0; char desc[64] = ""; u64 razwi_addr = 0; - u8 razwi_type; - int rc; + u8 razwi_flags = 0; /* * Init engine id by default as not valid and only if razwi initiated from engine with * engine id it will get valid value. - * Init razwi type to default, will be changed only if razwi caused by page fault of - * MMU access error */ - engine_id_1 = U16_MAX; - engine_id_2 = U16_MAX; - razwi_type = U8_MAX; + engine_id[0] = HL_RAZWI_NA_ENG_ID; + engine_id[1] = HL_RAZWI_NA_ENG_ID; gaudi_get_event_desc(event_type, desc, sizeof(desc)); dev_err_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n", event_type, desc); if (razwi) { - gaudi_print_and_get_razwi_info(hdev, &engine_id_1, &engine_id_2); - gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr, &razwi_type); - - /* In case it's the first razwi, save its parameters*/ - rc = atomic_cmpxchg(&hdev->captured_err_info.razwi.write_enable, 1, 0); - if (rc) { - hdev->captured_err_info.razwi.timestamp = ktime_get(); - hdev->captured_err_info.razwi.addr = razwi_addr; - hdev->captured_err_info.razwi.engine_id_1 = engine_id_1; - hdev->captured_err_info.razwi.engine_id_2 = engine_id_2; - /* - * If first engine id holds non valid value the razwi initiator - * does not have engine id - */ - hdev->captured_err_info.razwi.non_engine_initiator = - (engine_id_1 == U16_MAX); - hdev->captured_err_info.razwi.type = razwi_type; - + gaudi_print_and_get_razwi_info(hdev, &engine_id[0], &engine_id[1], &is_read, + &is_write); + gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr, event_mask); + + if (is_read) + razwi_flags |= HL_RAZWI_READ; + if (is_write) + razwi_flags |= HL_RAZWI_WRITE; + + if (engine_id[0] != HL_RAZWI_NA_ENG_ID) { + if (engine_id[1] != HL_RAZWI_NA_ENG_ID) + num_of_razwi_eng = 2; + else + num_of_razwi_eng = 1; } + + hl_handle_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng, razwi_flags, + event_mask); } } @@ -7350,8 +7347,8 @@ static void gaudi_print_out_of_sync_info(struct hl_device *hdev, { struct hl_hw_queue *q = &hdev->kernel_queues[GAUDI_QUEUE_ID_CPU_PQ]; - dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n", - sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci)); + dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%d\n", + le32_to_cpu(sync_err->pi), le32_to_cpu(sync_err->ci), q->pi, atomic_read(&q->ci)); } static void gaudi_print_fw_alive_info(struct hl_device *hdev, @@ -7359,9 +7356,10 @@ static void gaudi_print_fw_alive_info(struct hl_device *hdev, { dev_err(hdev->dev, "FW alive report: severity=%s, process_id=%u, thread_id=%u, uptime=%llu seconds\n", - (fw_alive->severity == FW_ALIVE_SEVERITY_MINOR) ? - "Minor" : "Critical", fw_alive->process_id, - fw_alive->thread_id, fw_alive->uptime_seconds); + (fw_alive->severity == FW_ALIVE_SEVERITY_MINOR) ? "Minor" : "Critical", + le32_to_cpu(fw_alive->process_id), + le32_to_cpu(fw_alive->thread_id), + le64_to_cpu(fw_alive->uptime_seconds)); } static void gaudi_print_nic_axi_irq_info(struct hl_device *hdev, u16 event_type, @@ -7679,7 +7677,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_HBM_0_DERR ... GAUDI_EVENT_HBM_3_DERR: case GAUDI_EVENT_MMU_DERR: case GAUDI_EVENT_NIC0_CS_DBG_DERR ... GAUDI_EVENT_NIC4_CS_DBG_DERR: - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR; @@ -7689,7 +7687,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_AXI_ECC: case GAUDI_EVENT_L2_RAM_ECC: case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; goto reset_device; @@ -7698,7 +7696,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_HBM1_SPI_0: case GAUDI_EVENT_HBM2_SPI_0: case GAUDI_EVENT_HBM3_SPI_0: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); gaudi_hbm_read_interrupts(hdev, gaudi_hbm_event_to_dev(event_type), &eq_entry->hbm_ecc_data); @@ -7710,7 +7708,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_HBM1_SPI_1: case GAUDI_EVENT_HBM2_SPI_1: case GAUDI_EVENT_HBM3_SPI_1: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); gaudi_hbm_read_interrupts(hdev, gaudi_hbm_event_to_dev(event_type), &eq_entry->hbm_ecc_data); @@ -7732,7 +7730,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr * if the event is a TPC Assertion or a "real" TPC DEC. */ event_mask |= HL_NOTIFIER_EVENT_TPC_ASSERT; - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); reset_required = gaudi_tpc_read_interrupts(hdev, tpc_dec_event_to_tpc_id(event_type), "AXI_SLV_DEC_Error"); @@ -7757,7 +7755,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_TPC5_KRN_ERR: case GAUDI_EVENT_TPC6_KRN_ERR: case GAUDI_EVENT_TPC7_KRN_ERR: - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); reset_required = gaudi_tpc_read_interrupts(hdev, tpc_krn_event_to_tpc_id(event_type), "KRN_ERR"); @@ -7796,7 +7794,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_HBM_0_SERR ... GAUDI_EVENT_HBM_3_SERR: fallthrough; case GAUDI_EVENT_MMU_SERR: - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data); hl_fw_unmask_irq(hdev, event_type); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; @@ -7806,14 +7804,14 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_CPU_AXI_SPLITTER: case GAUDI_EVENT_PSOC_AXI_DEC: case GAUDI_EVENT_PSOC_PRSTN_FALL: - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); hl_fw_unmask_irq(hdev, event_type); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; case GAUDI_EVENT_MMU_PAGE_FAULT: case GAUDI_EVENT_MMU_WR_PERM: - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); hl_fw_unmask_irq(hdev, event_type); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -7842,14 +7840,14 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_NIC4_QM1: case GAUDI_EVENT_DMA0_CORE ... GAUDI_EVENT_DMA7_CORE: case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM: - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); gaudi_handle_qman_err(hdev, event_type, &event_mask); hl_fw_unmask_irq(hdev, event_type); event_mask |= (HL_NOTIFIER_EVENT_USER_ENGINE_ERR | HL_NOTIFIER_EVENT_DEVICE_RESET); break; case GAUDI_EVENT_RAZWI_OR_ADC_SW: - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; goto reset_device; @@ -7862,7 +7860,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_TPC6_BMON_SPMU: case GAUDI_EVENT_TPC7_BMON_SPMU: case GAUDI_EVENT_DMA_BM_CH0 ... GAUDI_EVENT_DMA_BM_CH7: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); hl_fw_unmask_irq(hdev, event_type); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -7874,7 +7872,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr break; case GAUDI_EVENT_DMA_IF_SEI_0 ... GAUDI_EVENT_DMA_IF_SEI_3: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); gaudi_print_sm_sei_info(hdev, event_type, &eq_entry->sm_sei_data); rc = hl_state_dump(hdev); @@ -7903,18 +7901,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr break; case GAUDI_EVENT_DEV_RESET_REQ: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; goto reset_device; case GAUDI_EVENT_PKT_QUEUE_OUT_SYNC: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); gaudi_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; goto reset_device; case GAUDI_EVENT_FW_ALIVE_S: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); gaudi_print_fw_alive_info(hdev, &eq_entry->fw_alive); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; goto reset_device; @@ -7946,14 +7944,14 @@ reset_device: reset_required = false; } - /* despite reset doesn't execute. a notification on - * occurred event needs to be sent here - */ - hl_notifier_event_send_all(hdev, event_mask); - if (reset_required) - hl_device_reset(hdev, flags); - else + if (reset_required) { + hl_device_cond_reset(hdev, flags, event_mask); + } else { hl_fw_unmask_irq(hdev, event_type); + /* Notification on occurred event needs to be sent although reset is not executed */ + if (event_mask) + hl_notifier_event_send_all(hdev, event_mask); + } } static void *gaudi_get_events_stat(struct hl_device *hdev, bool aggregate, u32 *size) diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index 65e6cae6100a..e793fb2bdcbe 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -128,6 +128,8 @@ #define GAUDI2_VDEC_MSIX_ENTRIES (GAUDI2_IRQ_NUM_SHARED_DEC1_ABNRM - \ GAUDI2_IRQ_NUM_DCORE0_DEC0_NRM + 1) +#define ENGINE_ID_DCORE_OFFSET (GAUDI2_DCORE1_ENGINE_ID_EDMA_0 - GAUDI2_DCORE0_ENGINE_ID_EDMA_0) + enum hl_pmmu_fatal_cause { LATENCY_RD_OUT_FIFO_OVERRUN, LATENCY_WR_OUT_FIFO_OVERRUN, @@ -3966,11 +3968,7 @@ static void gaudi2_init_firmware_loader(struct hl_device *hdev) fw_loader->skip_bmc = false; fw_loader->sram_bar_id = SRAM_CFG_BAR_ID; fw_loader->dram_bar_id = DRAM_BAR_ID; - - if (hdev->asic_type == ASIC_GAUDI2 || hdev->asic_type == ASIC_GAUDI2_SEC) - fw_loader->cpu_timeout = GAUDI2_CPU_TIMEOUT_USEC; - else /* ASIC_GAUDI2_FPGA */ - fw_loader->cpu_timeout = GAUDI2_FPGA_CPU_TIMEOUT; + fw_loader->cpu_timeout = GAUDI2_CPU_TIMEOUT_USEC; /* here we update initial values for few specific dynamic regs (as * before reading the first descriptor from FW those value has to be @@ -4471,23 +4469,9 @@ static void gaudi2_init_sm(struct hl_device *hdev) reg_val = FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_MON_CONFIG_CQ_EN_MASK, 1); WREG32(mmDCORE0_SYNC_MNGR_OBJS_MON_CONFIG_0 + (4 * i), reg_val); - /* Init CQ0 DB */ - /* Configure the monitor to trigger MSI-X interrupt */ - /* TODO: - * Remove the if statement when virtual MSI-X doorbell is supported in simulator (SW-93022) - * and in F/W (SW-93024). - */ - if (!hdev->pdev || hdev->asic_prop.fw_security_enabled) { - u64 msix_db_reg = CFG_BASE + mmPCIE_DBI_MSIX_DOORBELL_OFF; - - WREG32(mmDCORE0_SYNC_MNGR_GLBL_LBW_ADDR_L_0, lower_32_bits(msix_db_reg)); - WREG32(mmDCORE0_SYNC_MNGR_GLBL_LBW_ADDR_H_0, upper_32_bits(msix_db_reg)); - } else { - WREG32(mmDCORE0_SYNC_MNGR_GLBL_LBW_ADDR_L_0, - lower_32_bits(gaudi2->virt_msix_db_dma_addr)); - WREG32(mmDCORE0_SYNC_MNGR_GLBL_LBW_ADDR_H_0, - upper_32_bits(gaudi2->virt_msix_db_dma_addr)); - } + /* Init CQ0 DB - configure the monitor to trigger MSI-X interrupt */ + WREG32(mmDCORE0_SYNC_MNGR_GLBL_LBW_ADDR_L_0, lower_32_bits(gaudi2->virt_msix_db_dma_addr)); + WREG32(mmDCORE0_SYNC_MNGR_GLBL_LBW_ADDR_H_0, upper_32_bits(gaudi2->virt_msix_db_dma_addr)); WREG32(mmDCORE0_SYNC_MNGR_GLBL_LBW_DATA_0, GAUDI2_IRQ_NUM_COMPLETION); for (i = 0 ; i < GAUDI2_RESERVED_CQ_NUMBER ; i++) { @@ -4535,7 +4519,7 @@ static void gaudi2_init_mme_acc(struct hl_device *hdev, u32 reg_base) static void gaudi2_init_dcore_mme(struct hl_device *hdev, int dcore_id, bool config_qman_only) { - u32 queue_id_base, reg_base, clk_en_addr = 0; + u32 queue_id_base, reg_base; switch (dcore_id) { case 0: @@ -4543,23 +4527,18 @@ static void gaudi2_init_dcore_mme(struct hl_device *hdev, int dcore_id, break; case 1: queue_id_base = GAUDI2_QUEUE_ID_DCORE1_MME_0_0; - clk_en_addr = mmDCORE1_MME_CTRL_LO_QM_SLV_CLK_EN; break; case 2: queue_id_base = GAUDI2_QUEUE_ID_DCORE2_MME_0_0; break; case 3: queue_id_base = GAUDI2_QUEUE_ID_DCORE3_MME_0_0; - clk_en_addr = mmDCORE3_MME_CTRL_LO_QM_SLV_CLK_EN; break; default: dev_err(hdev->dev, "Invalid dcore id %u\n", dcore_id); return; } - if (clk_en_addr && !(hdev->fw_components & FW_TYPE_BOOT_CPU)) - WREG32(clk_en_addr, 0x1); - if (!config_qman_only) { reg_base = gaudi2_mme_acc_blocks_bases[dcore_id]; gaudi2_init_mme_acc(hdev, reg_base); @@ -4660,20 +4639,6 @@ static void gaudi2_init_vdec_brdg_ctrl(struct hl_device *hdev, u64 base_addr, u3 { u32 sob_id; - /* TODO: - * Remove when virtual MSI-X doorbell is supported in simulator (SW-93022) and in F/W - * (SW-93024). - */ - if (!hdev->pdev || hdev->asic_prop.fw_security_enabled) { - u32 interrupt_id = GAUDI2_IRQ_NUM_DCORE0_DEC0_NRM + 2 * decoder_id; - - WREG32(base_addr + BRDG_CTRL_NRM_MSIX_LBW_AWADDR, mmPCIE_DBI_MSIX_DOORBELL_OFF); - WREG32(base_addr + BRDG_CTRL_NRM_MSIX_LBW_WDATA, interrupt_id); - WREG32(base_addr + BRDG_CTRL_ABNRM_MSIX_LBW_AWADDR, mmPCIE_DBI_MSIX_DOORBELL_OFF); - WREG32(base_addr + BRDG_CTRL_ABNRM_MSIX_LBW_WDATA, interrupt_id + 1); - return; - } - /* VCMD normal interrupt */ sob_id = GAUDI2_RESERVED_SOB_DEC_NRM_FIRST + decoder_id; WREG32(base_addr + BRDG_CTRL_NRM_MSIX_LBW_AWADDR, @@ -4730,30 +4695,6 @@ static void gaudi2_init_dec(struct hl_device *hdev) } } -static void gaudi2_init_msix_gw_table(struct hl_device *hdev) -{ - u32 first_reg_offset, last_reg_offset, msix_gw_table_base; - u8 first_bit, last_bit; - int i; - - msix_gw_table_base = mmPCIE_WRAP_MSIX_GW_TABLE_0; - first_reg_offset = (GAUDI2_IRQ_NUM_USER_FIRST >> 5) << 2; - first_bit = GAUDI2_IRQ_NUM_USER_FIRST % 32; - last_reg_offset = (GAUDI2_IRQ_NUM_USER_LAST >> 5) << 2; - last_bit = GAUDI2_IRQ_NUM_USER_LAST % 32; - - if (first_reg_offset == last_reg_offset) { - WREG32(msix_gw_table_base + first_reg_offset, GENMASK(last_bit, first_bit)); - return; - } - - WREG32(msix_gw_table_base + first_reg_offset, GENMASK(31, first_bit)); - WREG32(msix_gw_table_base + last_reg_offset, GENMASK(last_bit, 0)); - - for (i = first_reg_offset + 4; i < last_reg_offset ; i += 4) - WREG32(msix_gw_table_base + i, 0xFFFFFFFF); -} - static int gaudi2_mmu_update_asid_hop0_addr(struct hl_device *hdev, u32 stlb_base, u32 asid, u64 phys_addr) { @@ -5111,7 +5052,7 @@ static int gaudi2_pci_mmu_init(struct hl_device *hdev) mmu_base = mmPMMU_HBW_MMU_BASE; stlb_base = mmPMMU_HBW_STLB_BASE; - RMWREG32(stlb_base + STLB_HOP_CONFIGURATION_OFFSET, + RMWREG32_SHIFTED(stlb_base + STLB_HOP_CONFIGURATION_OFFSET, (0 << PMMU_HBW_STLB_HOP_CONFIGURATION_FIRST_HOP_SHIFT) | (5 << PMMU_HBW_STLB_HOP_CONFIGURATION_FIRST_LOOKUP_HOP_SMALL_P_SHIFT) | (4 << PMMU_HBW_STLB_HOP_CONFIGURATION_FIRST_LOOKUP_HOP_LARGE_P_SHIFT) | @@ -5127,7 +5068,7 @@ static int gaudi2_pci_mmu_init(struct hl_device *hdev) if (PAGE_SIZE == SZ_64K) { /* Set page sizes to 64K on hop5 and 16M on hop4 + enable 8 bit hops */ - RMWREG32(mmu_base + MMU_STATIC_MULTI_PAGE_SIZE_OFFSET, + RMWREG32_SHIFTED(mmu_base + MMU_STATIC_MULTI_PAGE_SIZE_OFFSET, FIELD_PREP(DCORE0_HMMU0_MMU_STATIC_MULTI_PAGE_SIZE_HOP5_PAGE_SIZE_MASK, 4) | FIELD_PREP(DCORE0_HMMU0_MMU_STATIC_MULTI_PAGE_SIZE_HOP4_PAGE_SIZE_MASK, 3) | FIELD_PREP( @@ -5175,7 +5116,7 @@ static int gaudi2_dcore_hmmu_init(struct hl_device *hdev, int dcore_id, RMWREG32(mmu_base + MMU_STATIC_MULTI_PAGE_SIZE_OFFSET, 5 /* 64MB */, MMU_STATIC_MULTI_PAGE_SIZE_HOP4_PAGE_SIZE_MASK); - RMWREG32(stlb_base + STLB_HOP_CONFIGURATION_OFFSET, + RMWREG32_SHIFTED(stlb_base + STLB_HOP_CONFIGURATION_OFFSET, FIELD_PREP(DCORE0_HMMU0_STLB_HOP_CONFIGURATION_FIRST_HOP_MASK, 0) | FIELD_PREP(DCORE0_HMMU0_STLB_HOP_CONFIGURATION_FIRST_LOOKUP_HOP_SMALL_P_MASK, 3) | FIELD_PREP(DCORE0_HMMU0_STLB_HOP_CONFIGURATION_FIRST_LOOKUP_HOP_LARGE_P_MASK, 3) | @@ -5267,8 +5208,6 @@ static int gaudi2_hw_init(struct hl_device *hdev) return rc; } - gaudi2_init_msix_gw_table(hdev); - gaudi2_init_scrambler_hbm(hdev); gaudi2_init_kdma(hdev); @@ -6863,6 +6802,7 @@ static inline bool is_info_event(u32 event) { switch (event) { case GAUDI2_EVENT_CPU_CPLD_SHUTDOWN_CAUSE: + case GAUDI2_EVENT_CPU_FIX_POWER_ENV_S ... GAUDI2_EVENT_CPU_FIX_THERMAL_ENV_E: return true; default: return false; @@ -7097,9 +7037,12 @@ static void gaudi2_handle_qman_err_generic(struct hl_device *hdev, const char *q static void gaudi2_razwi_rr_hbw_shared_printf_info(struct hl_device *hdev, u64 rtr_mstr_if_base_addr, bool is_write, char *name, - bool read_razwi_regs, struct hl_eq_razwi_info *razwi_info) + bool read_razwi_regs, struct hl_eq_razwi_info *razwi_info, + enum gaudi2_engine_id id, u64 *event_mask) { u32 razwi_hi, razwi_lo, razwi_xy; + u16 eng_id = id; + u8 rd_wr_flag; if (is_write) { if (read_razwi_regs) { @@ -7111,6 +7054,7 @@ static void gaudi2_razwi_rr_hbw_shared_printf_info(struct hl_device *hdev, razwi_lo = le32_to_cpu(razwi_info->hbw.rr_aw_razwi_lo_reg); razwi_xy = le32_to_cpu(razwi_info->hbw.rr_aw_razwi_id_reg); } + rd_wr_flag = HL_RAZWI_WRITE; } else { if (read_razwi_regs) { razwi_hi = RREG32(rtr_mstr_if_base_addr + RR_SHRD_HBW_AR_RAZWI_HI); @@ -7121,8 +7065,12 @@ static void gaudi2_razwi_rr_hbw_shared_printf_info(struct hl_device *hdev, razwi_lo = le32_to_cpu(razwi_info->hbw.rr_ar_razwi_lo_reg); razwi_xy = le32_to_cpu(razwi_info->hbw.rr_ar_razwi_id_reg); } + rd_wr_flag = HL_RAZWI_READ; } + hl_handle_razwi(hdev, (u64)razwi_hi << 32 | razwi_lo, &eng_id, 1, + rd_wr_flag | HL_RAZWI_HBW, event_mask); + dev_err_ratelimited(hdev->dev, "%s-RAZWI SHARED RR HBW %s error, address %#llx, Initiator coordinates 0x%x\n", name, is_write ? "WR" : "RD", (u64)razwi_hi << 32 | razwi_lo, razwi_xy); @@ -7130,9 +7078,12 @@ static void gaudi2_razwi_rr_hbw_shared_printf_info(struct hl_device *hdev, static void gaudi2_razwi_rr_lbw_shared_printf_info(struct hl_device *hdev, u64 rtr_mstr_if_base_addr, bool is_write, char *name, - bool read_razwi_regs, struct hl_eq_razwi_info *razwi_info) + bool read_razwi_regs, struct hl_eq_razwi_info *razwi_info, + enum gaudi2_engine_id id, u64 *event_mask) { u32 razwi_addr, razwi_xy; + u16 eng_id = id; + u8 rd_wr_flag; if (is_write) { if (read_razwi_regs) { @@ -7143,9 +7094,7 @@ static void gaudi2_razwi_rr_lbw_shared_printf_info(struct hl_device *hdev, razwi_xy = le32_to_cpu(razwi_info->lbw.rr_aw_razwi_id_reg); } - dev_err_ratelimited(hdev->dev, - "%s-RAZWI SHARED RR LBW WR error, mstr_if 0x%llx, captured address 0x%x, Initiator coordinates 0x%x\n", - name, rtr_mstr_if_base_addr, razwi_addr, razwi_xy); + rd_wr_flag = HL_RAZWI_WRITE; } else { if (read_razwi_regs) { razwi_addr = RREG32(rtr_mstr_if_base_addr + RR_SHRD_LBW_AR_RAZWI); @@ -7155,9 +7104,57 @@ static void gaudi2_razwi_rr_lbw_shared_printf_info(struct hl_device *hdev, razwi_xy = le32_to_cpu(razwi_info->lbw.rr_ar_razwi_id_reg); } - dev_err_ratelimited(hdev->dev, - "%s-RAZWI SHARED RR LBW AR error, mstr_if 0x%llx, captured address 0x%x Initiator coordinates 0x%x\n", - name, rtr_mstr_if_base_addr, razwi_addr, razwi_xy); + rd_wr_flag = HL_RAZWI_READ; + } + + hl_handle_razwi(hdev, razwi_addr, &eng_id, 1, rd_wr_flag | HL_RAZWI_LBW, event_mask); + dev_err_ratelimited(hdev->dev, + "%s-RAZWI SHARED RR LBW %s error, mstr_if 0x%llx, captured address 0x%x Initiator coordinates 0x%x\n", + name, is_write ? "WR" : "RD", rtr_mstr_if_base_addr, razwi_addr, + razwi_xy); +} + +static enum gaudi2_engine_id gaudi2_razwi_calc_engine_id(struct hl_device *hdev, + enum razwi_event_sources module, u8 module_idx) +{ + switch (module) { + case RAZWI_TPC: + if (module_idx == (NUM_OF_TPC_PER_DCORE * NUM_OF_DCORES)) + return GAUDI2_DCORE0_ENGINE_ID_TPC_6; + return (((module_idx / NUM_OF_TPC_PER_DCORE) * ENGINE_ID_DCORE_OFFSET) + + (module_idx % NUM_OF_TPC_PER_DCORE) + + (GAUDI2_DCORE0_ENGINE_ID_TPC_0 - GAUDI2_DCORE0_ENGINE_ID_EDMA_0)); + + case RAZWI_MME: + return ((GAUDI2_DCORE0_ENGINE_ID_MME - GAUDI2_DCORE0_ENGINE_ID_EDMA_0) + + (module_idx * ENGINE_ID_DCORE_OFFSET)); + + case RAZWI_EDMA: + return (((module_idx / NUM_OF_EDMA_PER_DCORE) * ENGINE_ID_DCORE_OFFSET) + + (module_idx % NUM_OF_EDMA_PER_DCORE)); + + case RAZWI_PDMA: + return (GAUDI2_ENGINE_ID_PDMA_0 + module_idx); + + case RAZWI_NIC: + return (GAUDI2_ENGINE_ID_NIC0_0 + (NIC_NUMBER_OF_QM_PER_MACRO * module_idx)); + + case RAZWI_DEC: + if (module_idx == 8) + return GAUDI2_PCIE_ENGINE_ID_DEC_0; + + if (module_idx == 9) + return GAUDI2_PCIE_ENGINE_ID_DEC_1; + ; + return (((module_idx / NUM_OF_DEC_PER_DCORE) * ENGINE_ID_DCORE_OFFSET) + + (module_idx % NUM_OF_DEC_PER_DCORE) + + (GAUDI2_DCORE0_ENGINE_ID_DEC_0 - GAUDI2_DCORE0_ENGINE_ID_EDMA_0)); + + case RAZWI_ROT: + return GAUDI2_ENGINE_ID_ROT_0 + module_idx; + + default: + return GAUDI2_ENGINE_ID_SIZE; } } @@ -7167,10 +7164,11 @@ static void gaudi2_razwi_rr_lbw_shared_printf_info(struct hl_device *hdev, */ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev, enum razwi_event_sources module, u8 module_idx, - u8 module_sub_idx, struct hl_eq_razwi_info *razwi_info) + u8 module_sub_idx, struct hl_eq_razwi_info *razwi_info, + u64 *event_mask) { bool via_sft = false, read_razwi_regs = false; - u32 rtr_id, dcore_id, dcore_rtr_id, sft_id; + u32 rtr_id, dcore_id, dcore_rtr_id, sft_id, eng_id; u64 rtr_mstr_if_base_addr; u32 hbw_shrd_aw = 0, hbw_shrd_ar = 0; u32 lbw_shrd_aw = 0, lbw_shrd_ar = 0; @@ -7304,9 +7302,11 @@ dump_info: if (!hbw_shrd_aw && !hbw_shrd_ar && !lbw_shrd_aw && !lbw_shrd_ar) return; + eng_id = gaudi2_razwi_calc_engine_id(hdev, module, module_idx); if (hbw_shrd_aw) { gaudi2_razwi_rr_hbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, true, - initiator_name, read_razwi_regs, razwi_info); + initiator_name, read_razwi_regs, razwi_info, + eng_id, event_mask); /* Clear event indication */ if (read_razwi_regs) @@ -7315,7 +7315,8 @@ dump_info: if (hbw_shrd_ar) { gaudi2_razwi_rr_hbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, false, - initiator_name, read_razwi_regs, razwi_info); + initiator_name, read_razwi_regs, razwi_info, + eng_id, event_mask); /* Clear event indication */ if (read_razwi_regs) @@ -7324,7 +7325,8 @@ dump_info: if (lbw_shrd_aw) { gaudi2_razwi_rr_lbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, true, - initiator_name, read_razwi_regs, razwi_info); + initiator_name, read_razwi_regs, razwi_info, + eng_id, event_mask); /* Clear event indication */ if (read_razwi_regs) @@ -7333,7 +7335,8 @@ dump_info: if (lbw_shrd_ar) { gaudi2_razwi_rr_lbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, false, - initiator_name, read_razwi_regs, razwi_info); + initiator_name, read_razwi_regs, razwi_info, + eng_id, event_mask); /* Clear event indication */ if (read_razwi_regs) @@ -7349,38 +7352,42 @@ static void gaudi2_check_if_razwi_happened(struct hl_device *hdev) /* check all TPCs */ for (mod_idx = 0 ; mod_idx < (NUM_OF_TPC_PER_DCORE * NUM_OF_DCORES + 1) ; mod_idx++) { if (prop->tpc_enabled_mask & BIT(mod_idx)) - gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_TPC, mod_idx, 0, NULL); + gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_TPC, mod_idx, 0, NULL, + NULL); } /* check all MMEs */ for (mod_idx = 0 ; mod_idx < (NUM_OF_MME_PER_DCORE * NUM_OF_DCORES) ; mod_idx++) for (sub_mod = MME_WAP0 ; sub_mod < MME_INITIATORS_MAX ; sub_mod++) gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mod_idx, - sub_mod, NULL); + sub_mod, NULL, NULL); /* check all EDMAs */ for (mod_idx = 0 ; mod_idx < (NUM_OF_EDMA_PER_DCORE * NUM_OF_DCORES) ; mod_idx++) if (prop->edma_enabled_mask & BIT(mod_idx)) - gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_EDMA, mod_idx, 0, NULL); + gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_EDMA, mod_idx, 0, NULL, + NULL); /* check all PDMAs */ for (mod_idx = 0 ; mod_idx < NUM_OF_PDMA ; mod_idx++) - gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_PDMA, mod_idx, 0, NULL); + gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_PDMA, mod_idx, 0, NULL, + NULL); /* check all NICs */ for (mod_idx = 0 ; mod_idx < NIC_NUMBER_OF_PORTS ; mod_idx++) if (hdev->nic_ports_mask & BIT(mod_idx)) gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_NIC, mod_idx >> 1, 0, - NULL); + NULL, NULL); /* check all DECs */ for (mod_idx = 0 ; mod_idx < NUMBER_OF_DEC ; mod_idx++) if (prop->decoder_enabled_mask & BIT(mod_idx)) - gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_DEC, mod_idx, 0, NULL); + gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_DEC, mod_idx, 0, NULL, + NULL); /* check all ROTs */ for (mod_idx = 0 ; mod_idx < NUM_OF_ROT ; mod_idx++) - gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_ROT, mod_idx, 0, NULL); + gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_ROT, mod_idx, 0, NULL, NULL); } static const char *gaudi2_get_initiators_name(u32 rtr_id) @@ -7455,25 +7462,176 @@ static const char *gaudi2_get_initiators_name(u32 rtr_id) } } +static u16 gaudi2_get_razwi_initiators(u32 rtr_id, u16 *engines) +{ + switch (rtr_id) { + case DCORE0_RTR0: + engines[0] = GAUDI2_DCORE0_ENGINE_ID_DEC_0; + engines[1] = GAUDI2_DCORE0_ENGINE_ID_DEC_1; + engines[2] = GAUDI2_PCIE_ENGINE_ID_DEC_0; + engines[3] = GAUDI2_PCIE_ENGINE_ID_DEC_1; + engines[4] = GAUDI2_DCORE0_ENGINE_ID_TPC_6; + engines[5] = GAUDI2_ENGINE_ID_PDMA_0; + engines[6] = GAUDI2_ENGINE_ID_PDMA_1; + engines[7] = GAUDI2_ENGINE_ID_PCIE; + engines[8] = GAUDI2_DCORE0_ENGINE_ID_EDMA_0; + engines[9] = GAUDI2_DCORE1_ENGINE_ID_EDMA_0; + engines[10] = GAUDI2_ENGINE_ID_PSOC; + return 11; + + case DCORE0_RTR1: + engines[0] = GAUDI2_DCORE0_ENGINE_ID_TPC_0; + engines[1] = GAUDI2_DCORE0_ENGINE_ID_TPC_1; + return 2; + + case DCORE0_RTR2: + engines[0] = GAUDI2_DCORE0_ENGINE_ID_TPC_2; + engines[1] = GAUDI2_DCORE0_ENGINE_ID_TPC_3; + return 2; + + case DCORE0_RTR3: + engines[0] = GAUDI2_DCORE0_ENGINE_ID_TPC_4; + engines[1] = GAUDI2_DCORE0_ENGINE_ID_TPC_5; + return 2; + + case DCORE0_RTR4: + case DCORE0_RTR5: + case DCORE0_RTR6: + case DCORE0_RTR7: + engines[0] = GAUDI2_DCORE0_ENGINE_ID_MME; + return 1; + + case DCORE1_RTR0: + case DCORE1_RTR1: + case DCORE1_RTR2: + case DCORE1_RTR3: + engines[0] = GAUDI2_DCORE1_ENGINE_ID_MME; + return 1; + + case DCORE1_RTR4: + engines[0] = GAUDI2_DCORE1_ENGINE_ID_TPC_4; + engines[1] = GAUDI2_DCORE1_ENGINE_ID_TPC_5; + return 2; + + case DCORE1_RTR5: + engines[0] = GAUDI2_DCORE1_ENGINE_ID_TPC_2; + engines[1] = GAUDI2_DCORE1_ENGINE_ID_TPC_3; + return 2; + + case DCORE1_RTR6: + engines[0] = GAUDI2_DCORE1_ENGINE_ID_TPC_0; + engines[1] = GAUDI2_DCORE1_ENGINE_ID_TPC_1; + return 2; + + case DCORE1_RTR7: + engines[0] = GAUDI2_DCORE1_ENGINE_ID_DEC_0; + engines[1] = GAUDI2_DCORE1_ENGINE_ID_DEC_1; + engines[2] = GAUDI2_ENGINE_ID_NIC0_0; + engines[3] = GAUDI2_ENGINE_ID_NIC1_0; + engines[4] = GAUDI2_ENGINE_ID_NIC2_0; + engines[5] = GAUDI2_ENGINE_ID_NIC3_0; + engines[6] = GAUDI2_ENGINE_ID_NIC4_0; + engines[7] = GAUDI2_ENGINE_ID_ARC_FARM; + engines[8] = GAUDI2_ENGINE_ID_KDMA; + engines[9] = GAUDI2_DCORE0_ENGINE_ID_EDMA_1; + engines[10] = GAUDI2_DCORE1_ENGINE_ID_EDMA_1; + return 11; + + case DCORE2_RTR0: + engines[0] = GAUDI2_DCORE2_ENGINE_ID_DEC_0; + engines[1] = GAUDI2_DCORE2_ENGINE_ID_DEC_1; + engines[2] = GAUDI2_ENGINE_ID_NIC5_0; + engines[3] = GAUDI2_ENGINE_ID_NIC6_0; + engines[4] = GAUDI2_ENGINE_ID_NIC7_0; + engines[5] = GAUDI2_ENGINE_ID_NIC8_0; + engines[6] = GAUDI2_DCORE2_ENGINE_ID_EDMA_0; + engines[7] = GAUDI2_DCORE3_ENGINE_ID_EDMA_0; + engines[8] = GAUDI2_ENGINE_ID_ROT_0; + return 9; + + case DCORE2_RTR1: + engines[0] = GAUDI2_DCORE2_ENGINE_ID_TPC_4; + engines[1] = GAUDI2_DCORE2_ENGINE_ID_TPC_5; + return 2; + + case DCORE2_RTR2: + engines[0] = GAUDI2_DCORE2_ENGINE_ID_TPC_2; + engines[1] = GAUDI2_DCORE2_ENGINE_ID_TPC_3; + return 2; + + case DCORE2_RTR3: + engines[0] = GAUDI2_DCORE2_ENGINE_ID_TPC_0; + engines[1] = GAUDI2_DCORE2_ENGINE_ID_TPC_1; + return 2; + + case DCORE2_RTR4: + case DCORE2_RTR5: + case DCORE2_RTR6: + case DCORE2_RTR7: + engines[0] = GAUDI2_DCORE2_ENGINE_ID_MME; + return 1; + case DCORE3_RTR0: + case DCORE3_RTR1: + case DCORE3_RTR2: + case DCORE3_RTR3: + engines[0] = GAUDI2_DCORE3_ENGINE_ID_MME; + return 1; + case DCORE3_RTR4: + engines[0] = GAUDI2_DCORE3_ENGINE_ID_TPC_0; + engines[1] = GAUDI2_DCORE3_ENGINE_ID_TPC_1; + return 2; + case DCORE3_RTR5: + engines[0] = GAUDI2_DCORE3_ENGINE_ID_TPC_2; + engines[1] = GAUDI2_DCORE3_ENGINE_ID_TPC_3; + return 2; + case DCORE3_RTR6: + engines[0] = GAUDI2_DCORE3_ENGINE_ID_TPC_4; + engines[1] = GAUDI2_DCORE3_ENGINE_ID_TPC_5; + return 2; + case DCORE3_RTR7: + engines[0] = GAUDI2_DCORE3_ENGINE_ID_DEC_0; + engines[1] = GAUDI2_DCORE3_ENGINE_ID_DEC_1; + engines[2] = GAUDI2_ENGINE_ID_NIC9_0; + engines[3] = GAUDI2_ENGINE_ID_NIC10_0; + engines[4] = GAUDI2_ENGINE_ID_NIC11_0; + engines[5] = GAUDI2_DCORE2_ENGINE_ID_EDMA_1; + engines[6] = GAUDI2_DCORE3_ENGINE_ID_EDMA_1; + engines[7] = GAUDI2_ENGINE_ID_ROT_1; + engines[8] = GAUDI2_ENGINE_ID_ROT_0; + return 9; + default: + return 0; + } +} + static void gaudi2_razwi_unmapped_addr_hbw_printf_info(struct hl_device *hdev, u32 rtr_id, - u64 rtr_ctrl_base_addr, bool is_write) + u64 rtr_ctrl_base_addr, bool is_write, + u64 *event_mask) { + u16 engines[HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR], num_of_eng; u32 razwi_hi, razwi_lo; + u8 rd_wr_flag; + + num_of_eng = gaudi2_get_razwi_initiators(rtr_id, &engines[0]); if (is_write) { razwi_hi = RREG32(rtr_ctrl_base_addr + DEC_RAZWI_HBW_AW_ADDR_HI); razwi_lo = RREG32(rtr_ctrl_base_addr + DEC_RAZWI_HBW_AW_ADDR_LO); + rd_wr_flag = HL_RAZWI_WRITE; /* Clear set indication */ WREG32(rtr_ctrl_base_addr + DEC_RAZWI_HBW_AW_SET, 0x1); } else { razwi_hi = RREG32(rtr_ctrl_base_addr + DEC_RAZWI_HBW_AR_ADDR_HI); razwi_lo = RREG32(rtr_ctrl_base_addr + DEC_RAZWI_HBW_AR_ADDR_LO); + rd_wr_flag = HL_RAZWI_READ; /* Clear set indication */ WREG32(rtr_ctrl_base_addr + DEC_RAZWI_HBW_AR_SET, 0x1); } + hl_handle_razwi(hdev, (u64)razwi_hi << 32 | razwi_lo, &engines[0], num_of_eng, + rd_wr_flag | HL_RAZWI_HBW, event_mask); dev_err_ratelimited(hdev->dev, "RAZWI PSOC unmapped HBW %s error, rtr id %u, address %#llx\n", is_write ? "WR" : "RD", rtr_id, (u64)razwi_hi << 32 | razwi_lo); @@ -7483,22 +7641,31 @@ static void gaudi2_razwi_unmapped_addr_hbw_printf_info(struct hl_device *hdev, u } static void gaudi2_razwi_unmapped_addr_lbw_printf_info(struct hl_device *hdev, u32 rtr_id, - u64 rtr_ctrl_base_addr, bool is_write) + u64 rtr_ctrl_base_addr, bool is_write, + u64 *event_mask) { + u16 engines[HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR], num_of_eng; u32 razwi_addr; + u8 rd_wr_flag; + + num_of_eng = gaudi2_get_razwi_initiators(rtr_id, &engines[0]); if (is_write) { razwi_addr = RREG32(rtr_ctrl_base_addr + DEC_RAZWI_LBW_AW_ADDR); + rd_wr_flag = HL_RAZWI_WRITE; /* Clear set indication */ WREG32(rtr_ctrl_base_addr + DEC_RAZWI_LBW_AW_SET, 0x1); } else { razwi_addr = RREG32(rtr_ctrl_base_addr + DEC_RAZWI_LBW_AR_ADDR); + rd_wr_flag = HL_RAZWI_READ; /* Clear set indication */ WREG32(rtr_ctrl_base_addr + DEC_RAZWI_LBW_AR_SET, 0x1); } + hl_handle_razwi(hdev, razwi_addr, &engines[0], num_of_eng, rd_wr_flag | HL_RAZWI_LBW, + event_mask); dev_err_ratelimited(hdev->dev, "RAZWI PSOC unmapped LBW %s error, rtr id %u, address %#x\n", is_write ? "WR" : "RD", rtr_id, razwi_addr); @@ -7508,7 +7675,7 @@ static void gaudi2_razwi_unmapped_addr_lbw_printf_info(struct hl_device *hdev, u } /* PSOC RAZWI interrupt occurs only when trying to access a bad address */ -static void gaudi2_ack_psoc_razwi_event_handler(struct hl_device *hdev) +static void gaudi2_ack_psoc_razwi_event_handler(struct hl_device *hdev, u64 *event_mask) { u32 hbw_aw_set, hbw_ar_set, lbw_aw_set, lbw_ar_set, rtr_id, dcore_id, dcore_rtr_id, xy, razwi_mask_info, razwi_intr = 0; @@ -7562,19 +7729,19 @@ static void gaudi2_ack_psoc_razwi_event_handler(struct hl_device *hdev) if (hbw_aw_set) gaudi2_razwi_unmapped_addr_hbw_printf_info(hdev, rtr_id, - rtr_ctrl_base_addr, true); + rtr_ctrl_base_addr, true, event_mask); if (hbw_ar_set) gaudi2_razwi_unmapped_addr_hbw_printf_info(hdev, rtr_id, - rtr_ctrl_base_addr, false); + rtr_ctrl_base_addr, false, event_mask); if (lbw_aw_set) gaudi2_razwi_unmapped_addr_lbw_printf_info(hdev, rtr_id, - rtr_ctrl_base_addr, true); + rtr_ctrl_base_addr, true, event_mask); if (lbw_ar_set) gaudi2_razwi_unmapped_addr_lbw_printf_info(hdev, rtr_id, - rtr_ctrl_base_addr, false); + rtr_ctrl_base_addr, false, event_mask); clear: /* Clear Interrupts only on pldm or if f/w doesn't handle interrupts */ @@ -7600,8 +7767,9 @@ static void _gaudi2_handle_qm_sei_err(struct hl_device *hdev, u64 qman_base) } static void gaudi2_handle_qm_sei_err(struct hl_device *hdev, u16 event_type, - struct hl_eq_razwi_info *razwi_info) + struct hl_eq_razwi_info *razwi_info, u64 *event_mask) { + enum razwi_event_sources module; u64 qman_base; u8 index; @@ -7611,9 +7779,11 @@ static void gaudi2_handle_qm_sei_err(struct hl_device *hdev, u16 event_type, qman_base = mmDCORE0_TPC0_QM_BASE + (index / NUM_OF_TPC_PER_DCORE) * DCORE_OFFSET + (index % NUM_OF_TPC_PER_DCORE) * DCORE_TPC_OFFSET; + module = RAZWI_TPC; break; case GAUDI2_EVENT_TPC24_AXI_ERR_RSP: qman_base = mmDCORE0_TPC6_QM_BASE; + module = RAZWI_TPC; break; case GAUDI2_EVENT_MME0_CTRL_AXI_ERROR_RESPONSE: case GAUDI2_EVENT_MME1_CTRL_AXI_ERROR_RESPONSE: @@ -7623,16 +7793,19 @@ static void gaudi2_handle_qm_sei_err(struct hl_device *hdev, u16 event_type, (GAUDI2_EVENT_MME1_CTRL_AXI_ERROR_RESPONSE - GAUDI2_EVENT_MME0_CTRL_AXI_ERROR_RESPONSE); qman_base = mmDCORE0_MME_QM_BASE + index * DCORE_OFFSET; + module = RAZWI_MME; break; case GAUDI2_EVENT_PDMA_CH0_AXI_ERR_RSP: case GAUDI2_EVENT_PDMA_CH1_AXI_ERR_RSP: index = event_type - GAUDI2_EVENT_PDMA_CH0_AXI_ERR_RSP; qman_base = mmPDMA0_QM_BASE + index * PDMA_OFFSET; + module = RAZWI_PDMA; break; case GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE: case GAUDI2_EVENT_ROTATOR1_AXI_ERROR_RESPONSE: index = event_type - GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE; qman_base = mmROT0_QM_BASE + index * ROT_OFFSET; + module = RAZWI_ROT; break; default: return; @@ -7647,7 +7820,7 @@ static void gaudi2_handle_qm_sei_err(struct hl_device *hdev, u16 event_type, /* check if RAZWI happened */ if (razwi_info) - gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_PDMA, 0, 0, razwi_info); + gaudi2_ack_module_razwi_event_handler(hdev, module, 0, 0, razwi_info, event_mask); } static void gaudi2_handle_qman_err(struct hl_device *hdev, u16 event_type) @@ -7813,7 +7986,8 @@ static void gaudi2_handle_cpu_sei_err(struct hl_device *hdev) } static void gaudi2_handle_rot_err(struct hl_device *hdev, u8 rot_index, - struct hl_eq_razwi_with_intr_cause *razwi_with_intr_cause) + struct hl_eq_razwi_with_intr_cause *razwi_with_intr_cause, + u64 *event_mask) { u64 intr_cause_data = le64_to_cpu(razwi_with_intr_cause->intr_cause.intr_cause_data); int i; @@ -7825,11 +7999,12 @@ static void gaudi2_handle_rot_err(struct hl_device *hdev, u8 rot_index, /* check if RAZWI happened */ gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_ROT, rot_index, 0, - &razwi_with_intr_cause->razwi_info); + &razwi_with_intr_cause->razwi_info, event_mask); } static void gaudi2_tpc_ack_interrupts(struct hl_device *hdev, u8 tpc_index, char *interrupt_name, - struct hl_eq_razwi_with_intr_cause *razwi_with_intr_cause) + struct hl_eq_razwi_with_intr_cause *razwi_with_intr_cause, + u64 *event_mask) { u64 intr_cause_data = le64_to_cpu(razwi_with_intr_cause->intr_cause.intr_cause_data); int i; @@ -7841,11 +8016,11 @@ static void gaudi2_tpc_ack_interrupts(struct hl_device *hdev, u8 tpc_index, char /* check if RAZWI happened */ gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_TPC, tpc_index, 0, - &razwi_with_intr_cause->razwi_info); + &razwi_with_intr_cause->razwi_info, event_mask); } static void gaudi2_handle_dec_err(struct hl_device *hdev, u8 dec_index, const char *interrupt_name, - struct hl_eq_razwi_info *razwi_info) + struct hl_eq_razwi_info *razwi_info, u64 *event_mask) { u32 sts_addr, sts_val, sts_clr_val = 0; int i; @@ -7871,14 +8046,15 @@ static void gaudi2_handle_dec_err(struct hl_device *hdev, u8 dec_index, const ch } /* check if RAZWI happened */ - gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_DEC, dec_index, 0, razwi_info); + gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_DEC, dec_index, 0, razwi_info, + event_mask); /* Write 1 clear errors */ WREG32(sts_addr, sts_clr_val); } static void gaudi2_handle_mme_err(struct hl_device *hdev, u8 mme_index, const char *interrupt_name, - struct hl_eq_razwi_info *razwi_info) + struct hl_eq_razwi_info *razwi_info, u64 *event_mask) { u32 sts_addr, sts_val, sts_clr_addr, sts_clr_val = 0; int i; @@ -7898,7 +8074,8 @@ static void gaudi2_handle_mme_err(struct hl_device *hdev, u8 mme_index, const ch /* check if RAZWI happened */ for (i = MME_WRITE ; i < MME_INITIATORS_MAX ; i++) - gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, i, razwi_info); + gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, i, razwi_info, + event_mask); WREG32(sts_clr_addr, sts_clr_val); } @@ -7915,7 +8092,7 @@ static void gaudi2_handle_mme_sbte_err(struct hl_device *hdev, u8 mme_index, u8 } static void gaudi2_handle_mme_wap_err(struct hl_device *hdev, u8 mme_index, - struct hl_eq_razwi_info *razwi_info) + struct hl_eq_razwi_info *razwi_info, u64 *event_mask) { u32 sts_addr, sts_val, sts_clr_addr, sts_clr_val = 0; int i; @@ -7935,8 +8112,10 @@ static void gaudi2_handle_mme_wap_err(struct hl_device *hdev, u8 mme_index, } /* check if RAZWI happened on WAP0/1 */ - gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, MME_WAP0, razwi_info); - gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, MME_WAP1, razwi_info); + gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, MME_WAP0, razwi_info, + event_mask); + gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, MME_WAP1, razwi_info, + event_mask); WREG32(sts_clr_addr, sts_clr_val); } @@ -7966,40 +8145,41 @@ static void gaudi2_handle_dma_core_event(struct hl_device *hdev, u64 intr_cause_ gaudi2_dma_core_interrupts_cause[i]); } -static void gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(struct hl_device *hdev) +static void gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(struct hl_device *hdev, u64 *event_mask) { u32 mstr_if_base_addr = mmPCIE_MSTR_RR_MSTR_IF_RR_SHRD_HBW_BASE, razwi_happened_addr; razwi_happened_addr = mstr_if_base_addr + RR_SHRD_HBW_AW_RAZWI_HAPPENED; if (RREG32(razwi_happened_addr)) { gaudi2_razwi_rr_hbw_shared_printf_info(hdev, mstr_if_base_addr, true, "PCIE", true, - NULL); + NULL, GAUDI2_ENGINE_ID_PCIE, event_mask); WREG32(razwi_happened_addr, 0x1); } razwi_happened_addr = mstr_if_base_addr + RR_SHRD_HBW_AR_RAZWI_HAPPENED; if (RREG32(razwi_happened_addr)) { gaudi2_razwi_rr_hbw_shared_printf_info(hdev, mstr_if_base_addr, false, "PCIE", true, - NULL); + NULL, GAUDI2_ENGINE_ID_PCIE, event_mask); WREG32(razwi_happened_addr, 0x1); } razwi_happened_addr = mstr_if_base_addr + RR_SHRD_LBW_AW_RAZWI_HAPPENED; if (RREG32(razwi_happened_addr)) { gaudi2_razwi_rr_lbw_shared_printf_info(hdev, mstr_if_base_addr, true, "PCIE", true, - NULL); + NULL, GAUDI2_ENGINE_ID_PCIE, event_mask); WREG32(razwi_happened_addr, 0x1); } razwi_happened_addr = mstr_if_base_addr + RR_SHRD_LBW_AR_RAZWI_HAPPENED; if (RREG32(razwi_happened_addr)) { gaudi2_razwi_rr_lbw_shared_printf_info(hdev, mstr_if_base_addr, false, "PCIE", true, - NULL); + NULL, GAUDI2_ENGINE_ID_PCIE, event_mask); WREG32(razwi_happened_addr, 0x1); } } -static void gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u64 intr_cause_data) +static void gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u64 intr_cause_data, + u64 *event_mask) { int i; @@ -8014,7 +8194,7 @@ static void gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u64 intr_cau case PCIE_WRAP_PCIE_IC_SEI_INTR_IND_AXI_LBW_ERR_INTR_MASK: break; case PCIE_WRAP_PCIE_IC_SEI_INTR_IND_BAD_ACCESS_INTR_MASK: - gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(hdev); + gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(hdev, event_mask); break; } } @@ -8047,7 +8227,8 @@ static void gaudi2_handle_hif_fatal(struct hl_device *hdev, u16 event_type, u64 } } -static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool is_pmmu) +static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool is_pmmu, + u64 *event_mask) { u32 valid, val; u64 addr; @@ -8064,6 +8245,7 @@ static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool dev_err_ratelimited(hdev->dev, "%s page fault on va 0x%llx\n", is_pmmu ? "PMMU" : "HMMU", addr); + hl_handle_page_fault(hdev, addr, 0, is_pmmu, event_mask); WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_PAGE_ERROR_CAPTURE), 0); } @@ -8089,7 +8271,7 @@ static void gaudi2_handle_access_error(struct hl_device *hdev, u64 mmu_base, boo } static void gaudi2_handle_mmu_spi_sei_generic(struct hl_device *hdev, const char *mmu_name, - u64 mmu_base, bool is_pmmu) + u64 mmu_base, bool is_pmmu, u64 *event_mask) { u32 spi_sei_cause, interrupt_clr = 0x0; int i; @@ -8102,7 +8284,7 @@ static void gaudi2_handle_mmu_spi_sei_generic(struct hl_device *hdev, const char mmu_name, gaudi2_mmu_spi_sei[i].cause); if (i == 0) - gaudi2_handle_page_error(hdev, mmu_base, is_pmmu); + gaudi2_handle_page_error(hdev, mmu_base, is_pmmu, event_mask); else if (i == 1) gaudi2_handle_access_error(hdev, mmu_base, is_pmmu); @@ -8118,11 +8300,10 @@ static void gaudi2_handle_mmu_spi_sei_generic(struct hl_device *hdev, const char WREG32(mmu_base + MMU_INTERRUPT_CLR_OFFSET, interrupt_clr); } -static bool gaudi2_handle_sm_err(struct hl_device *hdev, u8 sm_index) +static void gaudi2_handle_sm_err(struct hl_device *hdev, u8 sm_index) { u32 sei_cause_addr, sei_cause_val, sei_cause_cause, sei_cause_log; u32 cq_intr_addr, cq_intr_val, cq_intr_queue_index; - bool reset = true; int i; sei_cause_addr = mmDCORE0_SYNC_MNGR_GLBL_SM_SEI_CAUSE + DCORE_OFFSET * sm_index; @@ -8147,10 +8328,6 @@ static bool gaudi2_handle_sm_err(struct hl_device *hdev, u8 sm_index) gaudi2_sm_sei_cause[i].cause_name, gaudi2_sm_sei_cause[i].log_name, sei_cause_log & gaudi2_sm_sei_cause[i].log_mask); - - /* Due to a potential H/W issue, do not reset upon BRESP errors */ - if (i == 2) - reset = false; break; } @@ -8170,11 +8347,9 @@ static bool gaudi2_handle_sm_err(struct hl_device *hdev, u8 sm_index) /* Clear CQ_INTR */ WREG32(cq_intr_addr, 0); } - - return reset; } -static void gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type) +static void gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type, u64 *event_mask) { bool is_pmmu = false; char desc[32]; @@ -8232,7 +8407,7 @@ static void gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type return; } - gaudi2_handle_mmu_spi_sei_generic(hdev, desc, mmu_base, is_pmmu); + gaudi2_handle_mmu_spi_sei_generic(hdev, desc, mmu_base, is_pmmu, event_mask); } @@ -8476,8 +8651,8 @@ static void gaudi2_print_out_of_sync_info(struct hl_device *hdev, { struct hl_hw_queue *q = &hdev->kernel_queues[GAUDI2_QUEUE_ID_CPU_PQ]; - dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n", - sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci)); + dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%d\n", + le32_to_cpu(sync_err->pi), le32_to_cpu(sync_err->ci), q->pi, atomic_read(&q->ci)); } static void gaudi2_handle_pcie_p2p_msix(struct hl_device *hdev) @@ -8543,8 +8718,8 @@ static void gaudi2_print_cpu_pkt_failure_info(struct hl_device *hdev, struct hl_hw_queue *q = &hdev->kernel_queues[GAUDI2_QUEUE_ID_CPU_PQ]; dev_warn(hdev->dev, - "FW reported sanity check failure, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n", - sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci)); + "FW reported sanity check failure, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%d\n", + le32_to_cpu(sync_err->pi), le32_to_cpu(sync_err->ci), q->pi, atomic_read(&q->ci)); } static void hl_arc_event_handle(struct hl_device *hdev, @@ -8573,9 +8748,9 @@ static void hl_arc_event_handle(struct hl_device *hdev, static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry) { - u32 ctl, reset_flags = HL_DRV_RESET_HARD | HL_DRV_RESET_DELAY; struct gaudi2_device *gaudi2 = hdev->asic_specific; - bool reset_required = false, skip_reset = false; + bool reset_required = false, is_critical = false; + u32 ctl, reset_flags = HL_DRV_RESET_HARD; int index, sbte_index; u64 event_mask = 0; u16 event_type; @@ -8601,6 +8776,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; reset_required = gaudi2_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data); + is_critical = eq_entry->ecc_data.is_critical; break; case GAUDI2_EVENT_TPC0_QM ... GAUDI2_EVENT_PDMA1_QM: @@ -8626,29 +8802,30 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_PDMA_CH0_AXI_ERR_RSP: case GAUDI2_EVENT_PDMA_CH1_AXI_ERR_RSP: reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; - gaudi2_handle_qm_sei_err(hdev, event_type, &eq_entry->razwi_info); + gaudi2_handle_qm_sei_err(hdev, event_type, &eq_entry->razwi_info, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; case GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE: case GAUDI2_EVENT_ROTATOR1_AXI_ERROR_RESPONSE: index = event_type - GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE; - gaudi2_handle_rot_err(hdev, index, &eq_entry->razwi_with_intr_cause); - gaudi2_handle_qm_sei_err(hdev, event_type, NULL); + gaudi2_handle_rot_err(hdev, index, &eq_entry->razwi_with_intr_cause, &event_mask); + gaudi2_handle_qm_sei_err(hdev, event_type, NULL, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; case GAUDI2_EVENT_TPC0_AXI_ERR_RSP ... GAUDI2_EVENT_TPC24_AXI_ERR_RSP: index = event_type - GAUDI2_EVENT_TPC0_AXI_ERR_RSP; gaudi2_tpc_ack_interrupts(hdev, index, "AXI_ERR_RSP", - &eq_entry->razwi_with_intr_cause); - gaudi2_handle_qm_sei_err(hdev, event_type, NULL); + &eq_entry->razwi_with_intr_cause, &event_mask); + gaudi2_handle_qm_sei_err(hdev, event_type, NULL, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; case GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE ... GAUDI2_EVENT_DEC9_AXI_ERR_RSPONSE: index = event_type - GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE; - gaudi2_handle_dec_err(hdev, index, "AXI_ERR_RESPONSE", &eq_entry->razwi_info); + gaudi2_handle_dec_err(hdev, index, "AXI_ERR_RESPONSE", &eq_entry->razwi_info, + &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -8679,7 +8856,8 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_TPC24_KERNEL_ERR: index = (event_type - GAUDI2_EVENT_TPC0_KERNEL_ERR) / (GAUDI2_EVENT_TPC1_KERNEL_ERR - GAUDI2_EVENT_TPC0_KERNEL_ERR); - gaudi2_tpc_ack_interrupts(hdev, index, "KRN_ERR", &eq_entry->razwi_with_intr_cause); + gaudi2_tpc_ack_interrupts(hdev, index, "KRN_ERR", &eq_entry->razwi_with_intr_cause, + &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -8695,7 +8873,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_DEC9_SPI: index = (event_type - GAUDI2_EVENT_DEC0_SPI) / (GAUDI2_EVENT_DEC1_SPI - GAUDI2_EVENT_DEC0_SPI); - gaudi2_handle_dec_err(hdev, index, "SPI", &eq_entry->razwi_info); + gaudi2_handle_dec_err(hdev, index, "SPI", &eq_entry->razwi_info, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -8707,8 +8885,8 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent (GAUDI2_EVENT_MME1_CTRL_AXI_ERROR_RESPONSE - GAUDI2_EVENT_MME0_CTRL_AXI_ERROR_RESPONSE); gaudi2_handle_mme_err(hdev, index, - "CTRL_AXI_ERROR_RESPONSE", &eq_entry->razwi_info); - gaudi2_handle_qm_sei_err(hdev, event_type, NULL); + "CTRL_AXI_ERROR_RESPONSE", &eq_entry->razwi_info, &event_mask); + gaudi2_handle_qm_sei_err(hdev, event_type, NULL, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -8719,7 +8897,8 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent index = (event_type - GAUDI2_EVENT_MME0_QMAN_SW_ERROR) / (GAUDI2_EVENT_MME1_QMAN_SW_ERROR - GAUDI2_EVENT_MME0_QMAN_SW_ERROR); - gaudi2_handle_mme_err(hdev, index, "QMAN_SW_ERROR", &eq_entry->razwi_info); + gaudi2_handle_mme_err(hdev, index, "QMAN_SW_ERROR", &eq_entry->razwi_info, + &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -8730,7 +8909,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent index = (event_type - GAUDI2_EVENT_MME0_WAP_SOURCE_RESULT_INVALID) / (GAUDI2_EVENT_MME1_WAP_SOURCE_RESULT_INVALID - GAUDI2_EVENT_MME0_WAP_SOURCE_RESULT_INVALID); - gaudi2_handle_mme_wap_err(hdev, index, &eq_entry->razwi_info); + gaudi2_handle_mme_wap_err(hdev, index, &eq_entry->razwi_info, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -8749,7 +8928,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_PCIE_ADDR_DEC_ERR: gaudi2_print_pcie_addr_dec_info(hdev, - le64_to_cpu(eq_entry->intr_cause.intr_cause_data)); + le64_to_cpu(eq_entry->intr_cause.intr_cause_data), &event_mask); reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; @@ -8758,7 +8937,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_12_AXI_ERR_RSP: case GAUDI2_EVENT_PMMU0_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_PMMU0_SECURITY_ERROR: case GAUDI2_EVENT_PMMU_AXI_ERR_RSP_0: - gaudi2_handle_mmu_spi_sei_err(hdev, event_type); + gaudi2_handle_mmu_spi_sei_err(hdev, event_type, &event_mask); reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -8778,7 +8957,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent break; case GAUDI2_EVENT_PSOC63_RAZWI_OR_PID_MIN_MAX_INTERRUPT: - gaudi2_ack_psoc_razwi_event_handler(hdev); + gaudi2_ack_psoc_razwi_event_handler(hdev, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -8927,7 +9106,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_SM0_AXI_ERROR_RESPONSE ... GAUDI2_EVENT_SM3_AXI_ERROR_RESPONSE: index = event_type - GAUDI2_EVENT_SM0_AXI_ERROR_RESPONSE; - skip_reset = !gaudi2_handle_sm_err(hdev, index); + gaudi2_handle_sm_err(hdev, index); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -8956,13 +9135,20 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; + case GAUDI2_EVENT_CPU_FP32_NOT_SUPPORTED: + event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; + is_critical = true; + break; + default: if (gaudi2_irq_map_table[event_type].valid) dev_err_ratelimited(hdev->dev, "Cannot find handler for event %d\n", event_type); } - if ((gaudi2_irq_map_table[event_type].reset || reset_required) && !skip_reset) + if ((gaudi2_irq_map_table[event_type].reset || reset_required) && + (hdev->hard_reset_on_fw_events || + (hdev->asic_prop.fw_security_enabled && is_critical))) goto reset_device; /* Send unmask irq only for interrupts not classified as MSG */ @@ -8975,46 +9161,84 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent return; reset_device: - if (hdev->hard_reset_on_fw_events) { - hl_device_reset(hdev, reset_flags); - event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET; + if (hdev->asic_prop.fw_security_enabled && is_critical) { + reset_flags |= HL_DRV_RESET_BYPASS_REQ_TO_FW; + event_mask |= HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE; } else { - if (!gaudi2_irq_map_table[event_type].msg) - hl_fw_unmask_irq(hdev, event_type); + reset_flags |= HL_DRV_RESET_DELAY; } + event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET; + hl_device_cond_reset(hdev, reset_flags, event_mask); +} - if (event_mask) - hl_notifier_event_send_all(hdev, event_mask); +static int gaudi2_memset_memory_chunk_using_edma_qm(struct hl_device *hdev, + struct packet_lin_dma *lin_dma_pkt, dma_addr_t pkt_dma_addr, + u32 hw_queue_id, u32 size, u64 addr, u32 val) +{ + u32 ctl, pkt_size; + int rc = 0; + + ctl = FIELD_PREP(GAUDI2_PKT_CTL_OPCODE_MASK, PACKET_LIN_DMA); + ctl |= FIELD_PREP(GAUDI2_PKT_LIN_DMA_CTL_MEMSET_MASK, 1); + ctl |= FIELD_PREP(GAUDI2_PKT_LIN_DMA_CTL_WRCOMP_MASK, 1); + ctl |= FIELD_PREP(GAUDI2_PKT_CTL_EB_MASK, 1); + + lin_dma_pkt->ctl = cpu_to_le32(ctl); + lin_dma_pkt->src_addr = cpu_to_le64(val); + lin_dma_pkt->dst_addr = cpu_to_le64(addr); + lin_dma_pkt->tsize = cpu_to_le32(size); + + pkt_size = sizeof(struct packet_lin_dma); + + rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, pkt_size, pkt_dma_addr); + if (rc) + dev_err(hdev->dev, "Failed to send lin dma packet to H/W queue %d\n", + hw_queue_id); + + return rc; } static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 size, u64 val) { - struct asic_fixed_properties *prop = &hdev->asic_prop; + u32 edma_queues_id[] = {GAUDI2_QUEUE_ID_DCORE0_EDMA_0_0, + GAUDI2_QUEUE_ID_DCORE1_EDMA_0_0, + GAUDI2_QUEUE_ID_DCORE2_EDMA_0_0, + GAUDI2_QUEUE_ID_DCORE3_EDMA_0_0}; + u32 chunk_size, dcore, edma_idx, sob_offset, sob_addr, comp_val, + old_mmubp, mmubp, num_of_pkts, busy, pkt_size; u64 comp_addr, cur_addr = addr, end_addr = addr + size; - u32 chunk_size, busy, dcore, edma_idx, sob_offset, sob_addr, comp_val, edma_commit; - u32 old_mmubp, mmubp; - int rc = 0; + struct asic_fixed_properties *prop = &hdev->asic_prop; + void *lin_dma_pkts_arr; + dma_addr_t pkt_dma_addr; + int rc = 0, dma_num = 0; + + if (prop->edma_enabled_mask == 0) { + dev_info(hdev->dev, "non of the EDMA engines is enabled - skip dram scrubbing\n"); + return -EIO; + } sob_offset = hdev->asic_prop.first_available_user_sob[0] * 4; sob_addr = mmDCORE0_SYNC_MNGR_OBJS_SOB_OBJ_0 + sob_offset; comp_addr = CFG_BASE + sob_addr; comp_val = FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_INC_MASK, 1) | FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_VAL_MASK, 1); - - edma_commit = FIELD_PREP(ARC_FARM_KDMA_CTX_COMMIT_LIN_MASK, 1) | - FIELD_PREP(ARC_FARM_KDMA_CTX_COMMIT_MEM_SET_MASK, 1) | - FIELD_PREP(ARC_FARM_KDMA_CTX_COMMIT_WR_COMP_EN_MASK, 1); mmubp = FIELD_PREP(ARC_FARM_KDMA_CTX_AXUSER_HB_MMU_BP_WR_MASK, 1) | FIELD_PREP(ARC_FARM_KDMA_CTX_AXUSER_HB_MMU_BP_RD_MASK, 1); - if (prop->edma_enabled_mask == 0) { - dev_info(hdev->dev, "non of the EDMA engines is enabled - skip dram scrubbing\n"); - return -EIO; - } + /* Calculate how many lin dma pkts we'll need */ + num_of_pkts = div64_u64(round_up(size, SZ_2G), SZ_2G); + pkt_size = sizeof(struct packet_lin_dma); + + lin_dma_pkts_arr = hl_asic_dma_alloc_coherent(hdev, pkt_size * num_of_pkts, + &pkt_dma_addr, GFP_KERNEL); + if (!lin_dma_pkts_arr) + return -ENOMEM; /* * set mmu bypass for the scrubbing - all ddmas are configured the same so save * only the first one to restore later + * also set the sob addr for all edma cores for completion. + * set QM as trusted to allow it to access physical address with MMU bp. */ old_mmubp = RREG32(mmDCORE0_EDMA0_CORE_CTX_AXUSER_HB_MMU_BP); for (dcore = 0 ; dcore < NUM_OF_DCORES ; dcore++) { @@ -9027,17 +9251,22 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz WREG32(mmDCORE0_EDMA0_CORE_CTX_AXUSER_HB_MMU_BP + edma_offset, mmubp); + WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_ADDR_LO + edma_offset, + lower_32_bits(comp_addr)); + WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_ADDR_HI + edma_offset, + upper_32_bits(comp_addr)); + WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_WDATA + edma_offset, + comp_val); + gaudi2_qman_set_test_mode(hdev, + edma_queues_id[dcore] + 4 * edma_idx, true); } } - while (cur_addr < end_addr) { - int dma_num = 0; + WREG32(sob_addr, 0); - WREG32(sob_addr, 0); + while (cur_addr < end_addr) { for (dcore = 0 ; dcore < NUM_OF_DCORES ; dcore++) { for (edma_idx = 0 ; edma_idx < NUM_OF_EDMA_PER_DCORE ; edma_idx++) { - u32 edma_offset = dcore * DCORE_OFFSET + - edma_idx * DCORE_EDMA_OFFSET; u32 edma_bit = dcore * NUM_OF_EDMA_PER_DCORE + edma_idx; if (!(prop->edma_enabled_mask & BIT(edma_bit))) @@ -9045,41 +9274,26 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz chunk_size = min_t(u64, SZ_2G, end_addr - cur_addr); - WREG32(mmDCORE0_EDMA0_CORE_CTX_SRC_BASE_LO + edma_offset, - lower_32_bits(val)); - WREG32(mmDCORE0_EDMA0_CORE_CTX_SRC_BASE_HI + edma_offset, - upper_32_bits(val)); - - WREG32(mmDCORE0_EDMA0_CORE_CTX_DST_BASE_LO + edma_offset, - lower_32_bits(cur_addr)); - WREG32(mmDCORE0_EDMA0_CORE_CTX_DST_BASE_HI + edma_offset, - upper_32_bits(cur_addr)); - - WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_ADDR_LO + edma_offset, - lower_32_bits(comp_addr)); - WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_ADDR_HI + edma_offset, - upper_32_bits(comp_addr)); - WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_WDATA + edma_offset, - comp_val); - - WREG32(mmDCORE0_EDMA0_CORE_CTX_DST_TSIZE_0 + edma_offset, - chunk_size); - WREG32(mmDCORE0_EDMA0_CORE_CTX_COMMIT + edma_offset, edma_commit); + rc = gaudi2_memset_memory_chunk_using_edma_qm(hdev, + (struct packet_lin_dma *)lin_dma_pkts_arr + dma_num, + pkt_dma_addr + dma_num * pkt_size, + edma_queues_id[dcore] + edma_idx * 4, + chunk_size, cur_addr, val); + if (rc) + goto end; dma_num++; - cur_addr += chunk_size; - if (cur_addr == end_addr) - goto poll; + break; } } -poll: - rc = hl_poll_timeout(hdev, sob_addr, busy, (busy == dma_num), 1000, 1000000); - if (rc) { - dev_err(hdev->dev, "DMA Timeout during HBM scrubbing\n"); - goto end; - } + } + + rc = hl_poll_timeout(hdev, sob_addr, busy, (busy == dma_num), 1000, 1000000); + if (rc) { + dev_err(hdev->dev, "DMA Timeout during HBM scrubbing\n"); + goto end; } end: for (dcore = 0 ; dcore < NUM_OF_DCORES ; dcore++) { @@ -9091,10 +9305,17 @@ end: continue; WREG32(mmDCORE0_EDMA0_CORE_CTX_AXUSER_HB_MMU_BP + edma_offset, old_mmubp); + WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_ADDR_LO + edma_offset, 0); + WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_ADDR_HI + edma_offset, 0); + WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_WDATA + edma_offset, 0); + gaudi2_qman_set_test_mode(hdev, + edma_queues_id[dcore] + 4 * edma_idx, false); } } WREG32(sob_addr, 0); + hl_asic_dma_free_coherent(hdev, pkt_size * num_of_pkts, lin_dma_pkts_arr, pkt_dma_addr); + return rc; } @@ -9165,6 +9386,7 @@ static void gaudi2_restore_user_sm_registers(struct hl_device *hdev) gaudi2_memset_device_lbw(hdev, cq_lbw_data_addr, size, 0); gaudi2_memset_device_lbw(hdev, cq_base_l_addr, size, 0); gaudi2_memset_device_lbw(hdev, cq_base_h_addr, size, 0); + gaudi2_memset_device_lbw(hdev, cq_size_addr, size, 0); cq_lbw_l_addr = mmDCORE0_SYNC_MNGR_GLBL_LBW_ADDR_L_0 + DCORE_OFFSET; cq_lbw_h_addr = mmDCORE0_SYNC_MNGR_GLBL_LBW_ADDR_H_0 + DCORE_OFFSET; @@ -9990,7 +10212,7 @@ static void gaudi2_ack_mmu_error(struct hl_device *hdev, u64 mmu_id) if (gaudi2_get_mmu_base(hdev, mmu_id, &mmu_base)) return; - gaudi2_handle_page_error(hdev, mmu_base, is_pmmu); + gaudi2_handle_page_error(hdev, mmu_base, is_pmmu, NULL); gaudi2_handle_access_error(hdev, mmu_base, is_pmmu); } @@ -10141,10 +10363,9 @@ int gaudi2_send_device_activity(struct hl_device *hdev, bool open) { struct gaudi2_device *gaudi2 = hdev->asic_specific; - if (!(gaudi2->hw_cap_initialized & HW_CAP_CPU_Q) || hdev->fw_major_version < 37) + if (!(gaudi2->hw_cap_initialized & HW_CAP_CPU_Q)) return 0; - /* TODO: add check for FW version using minor ver once it's known */ return hl_fw_send_device_activity(hdev, open); } diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2P.h b/drivers/misc/habanalabs/gaudi2/gaudi2P.h index a99c348bbf39..b4383c199bbb 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2P.h +++ b/drivers/misc/habanalabs/gaudi2/gaudi2P.h @@ -23,8 +23,6 @@ #define GAUDI2_CPU_TIMEOUT_USEC 30000000 /* 30s */ -#define GAUDI2_FPGA_CPU_TIMEOUT 100000000 /* 100s */ - #define NUMBER_OF_PDMA_QUEUES 2 #define NUMBER_OF_EDMA_QUEUES 8 #define NUMBER_OF_MME_QUEUES 4 diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2_security.c b/drivers/misc/habanalabs/gaudi2/gaudi2_security.c index c6906fb14229..768c2f3dc900 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2_security.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2_security.c @@ -1764,6 +1764,7 @@ static const struct range gaudi2_pb_nic0_qm_arc_aux0_unsecured_regs[] = { {mmNIC0_QM_ARC_AUX0_CLUSTER_NUM, mmNIC0_QM_ARC_AUX0_WAKE_UP_EVENT}, {mmNIC0_QM_ARC_AUX0_ARC_RST_REQ, mmNIC0_QM_ARC_AUX0_CID_OFFSET_7}, {mmNIC0_QM_ARC_AUX0_SCRATCHPAD_0, mmNIC0_QM_ARC_AUX0_INFLIGHT_LBU_RD_CNT}, + {mmNIC0_QM_ARC_AUX0_CBU_EARLY_BRESP_EN, mmNIC0_QM_ARC_AUX0_CBU_EARLY_BRESP_EN}, {mmNIC0_QM_ARC_AUX0_LBU_EARLY_BRESP_EN, mmNIC0_QM_ARC_AUX0_LBU_EARLY_BRESP_EN}, {mmNIC0_QM_ARC_AUX0_DCCM_QUEUE_BASE_ADDR_0, mmNIC0_QM_ARC_AUX0_DCCM_QUEUE_ALERT_MSG}, {mmNIC0_QM_ARC_AUX0_DCCM_Q_PUSH_FIFO_CNT, mmNIC0_QM_ARC_AUX0_QMAN_ARC_CQ_SHADOW_CI}, diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 5ef9e3ca97a6..0f083fcf81a6 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -4475,8 +4475,8 @@ static void goya_print_out_of_sync_info(struct hl_device *hdev, { struct hl_hw_queue *q = &hdev->kernel_queues[GOYA_QUEUE_ID_CPU_PQ]; - dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n", - sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci)); + dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%d\n", + le32_to_cpu(sync_err->pi), le32_to_cpu(sync_err->ci), q->pi, atomic_read(&q->ci)); } static void goya_print_irq_info(struct hl_device *hdev, u16 event_type, diff --git a/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_events.h b/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_events.h index 34406770a76a..305b576222e6 100644 --- a/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_events.h +++ b/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_events.h @@ -957,6 +957,7 @@ enum gaudi2_async_event_id { GAUDI2_EVENT_CPU11_STATUS_NIC11_ENG0 = 1317, GAUDI2_EVENT_CPU11_STATUS_NIC11_ENG1 = 1318, GAUDI2_EVENT_ARC_DCCM_FULL = 1319, + GAUDI2_EVENT_CPU_FP32_NOT_SUPPORTED = 1320, GAUDI2_EVENT_SIZE, }; diff --git a/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h b/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h index 5bd4383c9f2c..d510cb10c883 100644 --- a/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h +++ b/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 * - * Copyright 2018-2021 HabanaLabs, Ltd. + * Copyright 2018-2022 HabanaLabs, Ltd. * All Rights Reserved. * */ @@ -2663,6 +2663,8 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = { .msg = 1, .reset = 0, .name = "STATUS_NIC11_ENG1" }, { .fc_id = 1319, .cpu_id = 625, .valid = 1, .msg = 1, .reset = 0, .name = "ARC_DCCM_FULL" }, + { .fc_id = 1320, .cpu_id = 626, .valid = 1, + .msg = 1, .reset = 1, .name = "FP32_NOT_SUPPORTED" }, }; #endif /* __GAUDI2_ASYNC_IDS_MAP_EVENTS_EXT_H_ */ diff --git a/drivers/misc/habanalabs/include/hw_ip/pci/pci_general.h b/drivers/misc/habanalabs/include/hw_ip/pci/pci_general.h index d232081d4e0f..f5d497dc9bdc 100644 --- a/drivers/misc/habanalabs/include/hw_ip/pci/pci_general.h +++ b/drivers/misc/habanalabs/include/hw_ip/pci/pci_general.h @@ -20,4 +20,11 @@ #define PCI_CONFIG_ELBI_STS_MASK (PCI_CONFIG_ELBI_STS_ERR | \ PCI_CONFIG_ELBI_STS_DONE) +enum hl_revision_id { + /* PCI revision ID 0 is not legal */ + REV_ID_INVALID = 0x00, + REV_ID_A = 0x01, + REV_ID_B = 0x02, +}; + #endif /* INCLUDE_PCI_GENERAL_H_ */ diff --git a/drivers/misc/hmc6352.c b/drivers/misc/hmc6352.c index 42b9adef28a3..8967940ecd1e 100644 --- a/drivers/misc/hmc6352.c +++ b/drivers/misc/hmc6352.c @@ -101,8 +101,7 @@ static const struct attribute_group m_compass_gr = { .attrs = mid_att_compass }; -static int hmc6352_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int hmc6352_probe(struct i2c_client *client) { int res; @@ -132,7 +131,7 @@ static struct i2c_driver hmc6352_driver = { .driver = { .name = "hmc6352", }, - .probe = hmc6352_probe, + .probe_new = hmc6352_probe, .remove = hmc6352_remove, .id_table = hmc6352_id, }; diff --git a/drivers/misc/ics932s401.c b/drivers/misc/ics932s401.c index 1cb71df966a4..12108a7b9b40 100644 --- a/drivers/misc/ics932s401.c +++ b/drivers/misc/ics932s401.c @@ -89,8 +89,7 @@ struct ics932s401_data { u8 regs[NUM_REGS]; }; -static int ics932s401_probe(struct i2c_client *client, - const struct i2c_device_id *id); +static int ics932s401_probe(struct i2c_client *client); static int ics932s401_detect(struct i2c_client *client, struct i2c_board_info *info); static void ics932s401_remove(struct i2c_client *client); @@ -106,7 +105,7 @@ static struct i2c_driver ics932s401_driver = { .driver = { .name = "ics932s401", }, - .probe = ics932s401_probe, + .probe_new = ics932s401_probe, .remove = ics932s401_remove, .id_table = ics932s401_id, .detect = ics932s401_detect, @@ -429,8 +428,7 @@ static int ics932s401_detect(struct i2c_client *client, return 0; } -static int ics932s401_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int ics932s401_probe(struct i2c_client *client) { struct ics932s401_data *data; int err; diff --git a/drivers/misc/isl29003.c b/drivers/misc/isl29003.c index 8ab61be79c76..aeda2fa89e61 100644 --- a/drivers/misc/isl29003.c +++ b/drivers/misc/isl29003.c @@ -374,8 +374,7 @@ static int isl29003_init_client(struct i2c_client *client) * I2C layer */ -static int isl29003_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int isl29003_probe(struct i2c_client *client) { struct i2c_adapter *adapter = client->adapter; struct isl29003_data *data; @@ -460,7 +459,7 @@ static struct i2c_driver isl29003_driver = { .name = ISL29003_DRV_NAME, .pm = ISL29003_PM_OPS, }, - .probe = isl29003_probe, + .probe_new = isl29003_probe, .remove = isl29003_remove, .id_table = isl29003_id, }; diff --git a/drivers/misc/isl29020.c b/drivers/misc/isl29020.c index c6f2a94f501a..3be02093368c 100644 --- a/drivers/misc/isl29020.c +++ b/drivers/misc/isl29020.c @@ -151,8 +151,7 @@ static int als_set_default_config(struct i2c_client *client) return 0; } -static int isl29020_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int isl29020_probe(struct i2c_client *client) { int res; @@ -215,7 +214,7 @@ static struct i2c_driver isl29020_driver = { .name = "isl29020", .pm = ISL29020_PM_OPS, }, - .probe = isl29020_probe, + .probe_new = isl29020_probe, .remove = isl29020_remove, .id_table = isl29020_id, }; diff --git a/drivers/misc/lis3lv02d/lis3lv02d_i2c.c b/drivers/misc/lis3lv02d/lis3lv02d_i2c.c index d7daa01fe7ca..7071412d6bf6 100644 --- a/drivers/misc/lis3lv02d/lis3lv02d_i2c.c +++ b/drivers/misc/lis3lv02d/lis3lv02d_i2c.c @@ -100,8 +100,7 @@ static const struct of_device_id lis3lv02d_i2c_dt_ids[] = { MODULE_DEVICE_TABLE(of, lis3lv02d_i2c_dt_ids); #endif -static int lis3lv02d_i2c_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int lis3lv02d_i2c_probe(struct i2c_client *client) { int ret = 0; struct lis3lv02d_platform_data *pdata = client->dev.platform_data; @@ -263,7 +262,7 @@ static struct i2c_driver lis3lv02d_i2c_driver = { .pm = &lis3_pm_ops, .of_match_table = of_match_ptr(lis3lv02d_i2c_dt_ids), }, - .probe = lis3lv02d_i2c_probe, + .probe_new = lis3lv02d_i2c_probe, .remove = lis3lv02d_i2c_remove, .id_table = lis3lv02d_id, }; diff --git a/drivers/misc/mei/bus-fixup.c b/drivers/misc/mei/bus-fixup.c index 71fbf0bc8453..6df7679d9739 100644 --- a/drivers/misc/mei/bus-fixup.c +++ b/drivers/misc/mei/bus-fixup.c @@ -188,17 +188,20 @@ static int mei_fwver(struct mei_cl_device *cldev) return ret; } +#define GFX_MEMORY_READY_TIMEOUT 200 /* timeout in milliseconds */ + static int mei_gfx_memory_ready(struct mei_cl_device *cldev) { struct mkhi_gfx_mem_ready req = {0}; - unsigned int mode = MEI_CL_IO_TX_INTERNAL; + unsigned int mode = MEI_CL_IO_TX_INTERNAL | MEI_CL_IO_TX_BLOCKING; req.hdr.group_id = MKHI_GROUP_ID_GFX; req.hdr.command = MKHI_GFX_MEMORY_READY_CMD_REQ; req.flags = MKHI_GFX_MEM_READY_PXP_ALLOWED; dev_dbg(&cldev->dev, "Sending memory ready command\n"); - return __mei_cl_send(cldev->cl, (u8 *)&req, sizeof(req), 0, mode); + return __mei_cl_send_timeout(cldev->cl, (u8 *)&req, sizeof(req), 0, + mode, GFX_MEMORY_READY_TIMEOUT); } static void mei_mkhi_fix(struct mei_cl_device *cldev) @@ -263,12 +266,13 @@ static void mei_gsc_mkhi_fix_ver(struct mei_cl_device *cldev) if (cldev->bus->pxp_mode == MEI_DEV_PXP_INIT) { ret = mei_gfx_memory_ready(cldev); - if (ret < 0) + if (ret < 0) { dev_err(&cldev->dev, "memory ready command failed %d\n", ret); - else + } else { dev_dbg(&cldev->dev, "memory ready command sent\n"); + cldev->bus->pxp_mode = MEI_DEV_PXP_SETUP; + } /* we go to reset after that */ - cldev->bus->pxp_mode = MEI_DEV_PXP_SETUP; goto out; } diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c index 1fbe127ff633..4a08b624910a 100644 --- a/drivers/misc/mei/bus.c +++ b/drivers/misc/mei/bus.c @@ -35,6 +35,26 @@ ssize_t __mei_cl_send(struct mei_cl *cl, const u8 *buf, size_t length, u8 vtag, unsigned int mode) { + return __mei_cl_send_timeout(cl, buf, length, vtag, mode, MAX_SCHEDULE_TIMEOUT); +} + +/** + * __mei_cl_send_timeout - internal client send (write) + * + * @cl: host client + * @buf: buffer to send + * @length: buffer length + * @vtag: virtual tag + * @mode: sending mode + * @timeout: send timeout in milliseconds. + * effective only for blocking writes: the MEI_CL_IO_TX_BLOCKING mode bit is set. + * set timeout to the MAX_SCHEDULE_TIMEOUT to maixum allowed wait. + * + * Return: written size bytes or < 0 on error + */ +ssize_t __mei_cl_send_timeout(struct mei_cl *cl, const u8 *buf, size_t length, u8 vtag, + unsigned int mode, unsigned long timeout) +{ struct mei_device *bus; struct mei_cl_cb *cb; ssize_t rets; @@ -108,7 +128,7 @@ ssize_t __mei_cl_send(struct mei_cl *cl, const u8 *buf, size_t length, u8 vtag, cb->buf.size = 0; } - rets = mei_cl_write(cl, cb); + rets = mei_cl_write(cl, cb, timeout); if (mode & MEI_CL_IO_SGL && rets == 0) rets = length; diff --git a/drivers/misc/mei/client.c b/drivers/misc/mei/client.c index 6c8b71ae32c8..9ddb854b8155 100644 --- a/drivers/misc/mei/client.c +++ b/drivers/misc/mei/client.c @@ -1954,10 +1954,13 @@ err: * * @cl: host client * @cb: write callback with filled data + * @timeout: send timeout in milliseconds. + * effective only for blocking writes: the cb->blocking is set. + * set timeout to the MAX_SCHEDULE_TIMEOUT to maixum allowed wait. * * Return: number of bytes sent on success, <0 on failure. */ -ssize_t mei_cl_write(struct mei_cl *cl, struct mei_cl_cb *cb) +ssize_t mei_cl_write(struct mei_cl *cl, struct mei_cl_cb *cb, unsigned long timeout) { struct mei_device *dev; struct mei_msg_data *buf; @@ -2081,11 +2084,20 @@ out: if (blocking && cl->writing_state != MEI_WRITE_COMPLETE) { mutex_unlock(&dev->device_lock); - rets = wait_event_interruptible(cl->tx_wait, - cl->writing_state == MEI_WRITE_COMPLETE || - (!mei_cl_is_connected(cl))); + rets = wait_event_interruptible_timeout(cl->tx_wait, + cl->writing_state == MEI_WRITE_COMPLETE || + (!mei_cl_is_connected(cl)), + msecs_to_jiffies(timeout)); mutex_lock(&dev->device_lock); + /* clean all queue on timeout as something fatal happened */ + if (rets == 0) { + rets = -ETIME; + mei_io_tx_list_free_cl(&dev->write_list, cl, NULL); + mei_io_tx_list_free_cl(&dev->write_waiting_list, cl, NULL); + } /* wait_event_interruptible returns -ERESTARTSYS */ + if (rets > 0) + rets = 0; if (rets) { if (signal_pending(current)) rets = -EINTR; diff --git a/drivers/misc/mei/client.h b/drivers/misc/mei/client.h index 418056fb1489..9052860bcfe0 100644 --- a/drivers/misc/mei/client.h +++ b/drivers/misc/mei/client.h @@ -246,7 +246,7 @@ int mei_cl_connect(struct mei_cl *cl, struct mei_me_client *me_cl, int mei_cl_irq_connect(struct mei_cl *cl, struct mei_cl_cb *cb, struct list_head *cmpl_list); int mei_cl_read_start(struct mei_cl *cl, size_t length, const struct file *fp); -ssize_t mei_cl_write(struct mei_cl *cl, struct mei_cl_cb *cb); +ssize_t mei_cl_write(struct mei_cl *cl, struct mei_cl_cb *cb, unsigned long timeout); int mei_cl_irq_write(struct mei_cl *cl, struct mei_cl_cb *cb, struct list_head *cmpl_list); diff --git a/drivers/misc/mei/main.c b/drivers/misc/mei/main.c index 930887e7e38d..632d4ae21e46 100644 --- a/drivers/misc/mei/main.c +++ b/drivers/misc/mei/main.c @@ -383,7 +383,7 @@ static ssize_t mei_write(struct file *file, const char __user *ubuf, goto out; } - rets = mei_cl_write(cl, cb); + rets = mei_cl_write(cl, cb, MAX_SCHEDULE_TIMEOUT); out: mutex_unlock(&dev->device_lock); return rets; diff --git a/drivers/misc/mei/mei_dev.h b/drivers/misc/mei/mei_dev.h index 8d8018428d9d..996b70a988be 100644 --- a/drivers/misc/mei/mei_dev.h +++ b/drivers/misc/mei/mei_dev.h @@ -379,6 +379,8 @@ void mei_cl_bus_rescan_work(struct work_struct *work); void mei_cl_bus_dev_fixup(struct mei_cl_device *dev); ssize_t __mei_cl_send(struct mei_cl *cl, const u8 *buf, size_t length, u8 vtag, unsigned int mode); +ssize_t __mei_cl_send_timeout(struct mei_cl *cl, const u8 *buf, size_t length, u8 vtag, + unsigned int mode, unsigned long timeout); ssize_t __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length, u8 *vtag, unsigned int mode, unsigned long timeout); bool mei_cl_bus_rx_event(struct mei_cl *cl); diff --git a/drivers/misc/ocxl/config.c b/drivers/misc/ocxl/config.c index e401a51596b9..92ab49705f64 100644 --- a/drivers/misc/ocxl/config.c +++ b/drivers/misc/ocxl/config.c @@ -193,6 +193,18 @@ static int read_dvsec_vendor(struct pci_dev *dev) return 0; } +/** + * get_dvsec_vendor0() - Find a related PCI device (function 0) + * @dev: PCI device to match + * @dev0: The PCI device (function 0) found + * @out_pos: The position of PCI device (function 0) + * + * Returns 0 on success, negative on failure. + * + * NOTE: If it's successful, the reference of dev0 is increased, + * so after using it, the callers must call pci_dev_put() to give + * up the reference. + */ static int get_dvsec_vendor0(struct pci_dev *dev, struct pci_dev **dev0, int *out_pos) { @@ -202,10 +214,14 @@ static int get_dvsec_vendor0(struct pci_dev *dev, struct pci_dev **dev0, dev = get_function_0(dev); if (!dev) return -1; + } else { + dev = pci_dev_get(dev); } pos = find_dvsec(dev, OCXL_DVSEC_VENDOR_ID); - if (!pos) + if (!pos) { + pci_dev_put(dev); return -1; + } *dev0 = dev; *out_pos = pos; return 0; @@ -222,6 +238,7 @@ int ocxl_config_get_reset_reload(struct pci_dev *dev, int *val) pci_read_config_dword(dev0, pos + OCXL_DVSEC_VENDOR_RESET_RELOAD, &reset_reload); + pci_dev_put(dev0); *val = !!(reset_reload & BIT(0)); return 0; } @@ -243,6 +260,7 @@ int ocxl_config_set_reset_reload(struct pci_dev *dev, int val) reset_reload &= ~BIT(0); pci_write_config_dword(dev0, pos + OCXL_DVSEC_VENDOR_RESET_RELOAD, reset_reload); + pci_dev_put(dev0); return 0; } diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c index d46dba2df5a1..452d5777a0e4 100644 --- a/drivers/misc/ocxl/file.c +++ b/drivers/misc/ocxl/file.c @@ -541,8 +541,11 @@ int ocxl_file_register_afu(struct ocxl_afu *afu) goto err_put; rc = device_register(&info->dev); - if (rc) - goto err_put; + if (rc) { + free_minor(info); + put_device(&info->dev); + return rc; + } rc = ocxl_sysfs_register_afu(info); if (rc) diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c index d7ef61e602ed..b836936e9747 100644 --- a/drivers/misc/sgi-gru/grufault.c +++ b/drivers/misc/sgi-gru/grufault.c @@ -648,6 +648,7 @@ int gru_handle_user_call_os(unsigned long cb) if ((cb & (GRU_HANDLE_STRIDE - 1)) || ucbnum >= GRU_NUM_CB) return -EINVAL; +again: gts = gru_find_lock_gts(cb); if (!gts) return -EINVAL; @@ -656,7 +657,11 @@ int gru_handle_user_call_os(unsigned long cb) if (ucbnum >= gts->ts_cbr_au_count * GRU_CBR_AU_SIZE) goto exit; - gru_check_context_placement(gts); + if (gru_check_context_placement(gts)) { + gru_unlock_gts(gts); + gru_unload_context(gts, 1); + goto again; + } /* * CCH may contain stale data if ts_force_cch_reload is set. @@ -874,7 +879,11 @@ int gru_set_context_option(unsigned long arg) } else { gts->ts_user_blade_id = req.val1; gts->ts_user_chiplet_id = req.val0; - gru_check_context_placement(gts); + if (gru_check_context_placement(gts)) { + gru_unlock_gts(gts); + gru_unload_context(gts, 1); + return ret; + } } break; case sco_gseg_owner: diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c index 6706ef3c5977..4eb4b9455139 100644 --- a/drivers/misc/sgi-gru/grumain.c +++ b/drivers/misc/sgi-gru/grumain.c @@ -716,9 +716,10 @@ static int gru_check_chiplet_assignment(struct gru_state *gru, * chiplet. Misassignment can occur if the process migrates to a different * blade or if the user changes the selected blade/chiplet. */ -void gru_check_context_placement(struct gru_thread_state *gts) +int gru_check_context_placement(struct gru_thread_state *gts) { struct gru_state *gru; + int ret = 0; /* * If the current task is the context owner, verify that the @@ -726,15 +727,23 @@ void gru_check_context_placement(struct gru_thread_state *gts) * references. Pthread apps use non-owner references to the CBRs. */ gru = gts->ts_gru; + /* + * If gru or gts->ts_tgid_owner isn't initialized properly, return + * success to indicate that the caller does not need to unload the + * gru context.The caller is responsible for their inspection and + * reinitialization if needed. + */ if (!gru || gts->ts_tgid_owner != current->tgid) - return; + return ret; if (!gru_check_chiplet_assignment(gru, gts)) { STAT(check_context_unload); - gru_unload_context(gts, 1); + ret = -EINVAL; } else if (gru_retarget_intr(gts)) { STAT(check_context_retarget_intr); } + + return ret; } @@ -934,7 +943,12 @@ again: mutex_lock(>s->ts_ctxlock); preempt_disable(); - gru_check_context_placement(gts); + if (gru_check_context_placement(gts)) { + preempt_enable(); + mutex_unlock(>s->ts_ctxlock); + gru_unload_context(gts, 1); + return VM_FAULT_NOPAGE; + } if (!gts->ts_gru) { STAT(load_user_context); diff --git a/drivers/misc/sgi-gru/grutables.h b/drivers/misc/sgi-gru/grutables.h index 8c52776db234..640daf1994df 100644 --- a/drivers/misc/sgi-gru/grutables.h +++ b/drivers/misc/sgi-gru/grutables.h @@ -632,7 +632,7 @@ extern int gru_user_flush_tlb(unsigned long arg); extern int gru_user_unload_context(unsigned long arg); extern int gru_get_exception_detail(unsigned long arg); extern int gru_set_context_option(unsigned long address); -extern void gru_check_context_placement(struct gru_thread_state *gts); +extern int gru_check_context_placement(struct gru_thread_state *gts); extern int gru_cpu_fault_map_id(void); extern struct vm_area_struct *gru_find_vma(unsigned long vaddr); extern void gru_flush_all_tlb(struct gru_state *gru); diff --git a/drivers/misc/smpro-errmon.c b/drivers/misc/smpro-errmon.c new file mode 100644 index 000000000000..d1431d419aa4 --- /dev/null +++ b/drivers/misc/smpro-errmon.c @@ -0,0 +1,529 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Ampere Computing SoC's SMpro Error Monitoring Driver + * + * Copyright (c) 2022, Ampere Computing LLC + * + */ + +#include <linux/i2c.h> +#include <linux/mod_devicetable.h> +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/regmap.h> + +/* GPI RAS Error Registers */ +#define GPI_RAS_ERR 0x7E + +/* Core and L2C Error Registers */ +#define CORE_CE_ERR_CNT 0x80 +#define CORE_CE_ERR_LEN 0x81 +#define CORE_CE_ERR_DATA 0x82 +#define CORE_UE_ERR_CNT 0x83 +#define CORE_UE_ERR_LEN 0x84 +#define CORE_UE_ERR_DATA 0x85 + +/* Memory Error Registers */ +#define MEM_CE_ERR_CNT 0x90 +#define MEM_CE_ERR_LEN 0x91 +#define MEM_CE_ERR_DATA 0x92 +#define MEM_UE_ERR_CNT 0x93 +#define MEM_UE_ERR_LEN 0x94 +#define MEM_UE_ERR_DATA 0x95 + +/* RAS Error/Warning Registers */ +#define ERR_SMPRO_TYPE 0xA0 +#define ERR_PMPRO_TYPE 0xA1 +#define ERR_SMPRO_INFO_LO 0xA2 +#define ERR_SMPRO_INFO_HI 0xA3 +#define ERR_SMPRO_DATA_LO 0xA4 +#define ERR_SMPRO_DATA_HI 0xA5 +#define WARN_SMPRO_INFO_LO 0xAA +#define WARN_SMPRO_INFO_HI 0xAB +#define ERR_PMPRO_INFO_LO 0xA6 +#define ERR_PMPRO_INFO_HI 0xA7 +#define ERR_PMPRO_DATA_LO 0xA8 +#define ERR_PMPRO_DATA_HI 0xA9 +#define WARN_PMPRO_INFO_LO 0xAC +#define WARN_PMPRO_INFO_HI 0xAD + +/* PCIE Error Registers */ +#define PCIE_CE_ERR_CNT 0xC0 +#define PCIE_CE_ERR_LEN 0xC1 +#define PCIE_CE_ERR_DATA 0xC2 +#define PCIE_UE_ERR_CNT 0xC3 +#define PCIE_UE_ERR_LEN 0xC4 +#define PCIE_UE_ERR_DATA 0xC5 + +/* Other Error Registers */ +#define OTHER_CE_ERR_CNT 0xD0 +#define OTHER_CE_ERR_LEN 0xD1 +#define OTHER_CE_ERR_DATA 0xD2 +#define OTHER_UE_ERR_CNT 0xD8 +#define OTHER_UE_ERR_LEN 0xD9 +#define OTHER_UE_ERR_DATA 0xDA + +/* Event Data Registers */ +#define VRD_WARN_FAULT_EVENT_DATA 0x78 +#define VRD_HOT_EVENT_DATA 0x79 +#define DIMM_HOT_EVENT_DATA 0x7A + +#define MAX_READ_BLOCK_LENGTH 48 + +#define RAS_SMPRO_ERR 0 +#define RAS_PMPRO_ERR 1 + +enum RAS_48BYTES_ERR_TYPES { + CORE_CE_ERR, + CORE_UE_ERR, + MEM_CE_ERR, + MEM_UE_ERR, + PCIE_CE_ERR, + PCIE_UE_ERR, + OTHER_CE_ERR, + OTHER_UE_ERR, + NUM_48BYTES_ERR_TYPE, +}; + +struct smpro_error_hdr { + u8 count; /* Number of the RAS errors */ + u8 len; /* Number of data bytes */ + u8 data; /* Start of 48-byte data */ + u8 max_cnt; /* Max num of errors */ +}; + +/* + * Included Address of registers to get Count, Length of data and Data + * of the 48 bytes error data + */ +static struct smpro_error_hdr smpro_error_table[] = { + [CORE_CE_ERR] = { + .count = CORE_CE_ERR_CNT, + .len = CORE_CE_ERR_LEN, + .data = CORE_CE_ERR_DATA, + .max_cnt = 32 + }, + [CORE_UE_ERR] = { + .count = CORE_UE_ERR_CNT, + .len = CORE_UE_ERR_LEN, + .data = CORE_UE_ERR_DATA, + .max_cnt = 32 + }, + [MEM_CE_ERR] = { + .count = MEM_CE_ERR_CNT, + .len = MEM_CE_ERR_LEN, + .data = MEM_CE_ERR_DATA, + .max_cnt = 16 + }, + [MEM_UE_ERR] = { + .count = MEM_UE_ERR_CNT, + .len = MEM_UE_ERR_LEN, + .data = MEM_UE_ERR_DATA, + .max_cnt = 16 + }, + [PCIE_CE_ERR] = { + .count = PCIE_CE_ERR_CNT, + .len = PCIE_CE_ERR_LEN, + .data = PCIE_CE_ERR_DATA, + .max_cnt = 96 + }, + [PCIE_UE_ERR] = { + .count = PCIE_UE_ERR_CNT, + .len = PCIE_UE_ERR_LEN, + .data = PCIE_UE_ERR_DATA, + .max_cnt = 96 + }, + [OTHER_CE_ERR] = { + .count = OTHER_CE_ERR_CNT, + .len = OTHER_CE_ERR_LEN, + .data = OTHER_CE_ERR_DATA, + .max_cnt = 8 + }, + [OTHER_UE_ERR] = { + .count = OTHER_UE_ERR_CNT, + .len = OTHER_UE_ERR_LEN, + .data = OTHER_UE_ERR_DATA, + .max_cnt = 8 + }, +}; + +/* + * List of SCP registers which are used to get + * one type of RAS Internal errors. + */ +struct smpro_int_error_hdr { + u8 type; + u8 info_l; + u8 info_h; + u8 data_l; + u8 data_h; + u8 warn_l; + u8 warn_h; +}; + +static struct smpro_int_error_hdr list_smpro_int_error_hdr[] = { + [RAS_SMPRO_ERR] = { + .type = ERR_SMPRO_TYPE, + .info_l = ERR_SMPRO_INFO_LO, + .info_h = ERR_SMPRO_INFO_HI, + .data_l = ERR_SMPRO_DATA_LO, + .data_h = ERR_SMPRO_DATA_HI, + .warn_l = WARN_SMPRO_INFO_LO, + .warn_h = WARN_SMPRO_INFO_HI, + }, + [RAS_PMPRO_ERR] = { + .type = ERR_PMPRO_TYPE, + .info_l = ERR_PMPRO_INFO_LO, + .info_h = ERR_PMPRO_INFO_HI, + .data_l = ERR_PMPRO_DATA_LO, + .data_h = ERR_PMPRO_DATA_HI, + .warn_l = WARN_PMPRO_INFO_LO, + .warn_h = WARN_PMPRO_INFO_HI, + }, +}; + +struct smpro_errmon { + struct regmap *regmap; +}; + +enum EVENT_TYPES { + VRD_WARN_FAULT_EVENT, + VRD_HOT_EVENT, + DIMM_HOT_EVENT, + NUM_EVENTS_TYPE, +}; + +/* Included Address of event source and data registers */ +static u8 smpro_event_table[NUM_EVENTS_TYPE] = { + VRD_WARN_FAULT_EVENT_DATA, + VRD_HOT_EVENT_DATA, + DIMM_HOT_EVENT_DATA, +}; + +static ssize_t smpro_event_data_read(struct device *dev, + struct device_attribute *da, char *buf, + int channel) +{ + struct smpro_errmon *errmon = dev_get_drvdata(dev); + s32 event_data; + int ret; + + ret = regmap_read(errmon->regmap, smpro_event_table[channel], &event_data); + if (ret) + return ret; + /* Clear event after read */ + if (event_data != 0) + regmap_write(errmon->regmap, smpro_event_table[channel], event_data); + + return sysfs_emit(buf, "%04x\n", event_data); +} + +static ssize_t smpro_overflow_data_read(struct device *dev, struct device_attribute *da, + char *buf, int channel) +{ + struct smpro_errmon *errmon = dev_get_drvdata(dev); + struct smpro_error_hdr *err_info; + s32 err_count; + int ret; + + err_info = &smpro_error_table[channel]; + + ret = regmap_read(errmon->regmap, err_info->count, &err_count); + if (ret) + return ret; + + /* Bit 8 indicates the overflow status */ + return sysfs_emit(buf, "%d\n", (err_count & BIT(8)) ? 1 : 0); +} + +static ssize_t smpro_error_data_read(struct device *dev, struct device_attribute *da, + char *buf, int channel) +{ + struct smpro_errmon *errmon = dev_get_drvdata(dev); + unsigned char err_data[MAX_READ_BLOCK_LENGTH]; + struct smpro_error_hdr *err_info; + s32 err_count, err_length; + int ret; + + err_info = &smpro_error_table[channel]; + + ret = regmap_read(errmon->regmap, err_info->count, &err_count); + /* Error count is the low byte */ + err_count &= 0xff; + if (ret || !err_count || err_count > err_info->max_cnt) + return ret; + + ret = regmap_read(errmon->regmap, err_info->len, &err_length); + if (ret || err_length <= 0) + return ret; + + if (err_length > MAX_READ_BLOCK_LENGTH) + err_length = MAX_READ_BLOCK_LENGTH; + + memset(err_data, 0x00, MAX_READ_BLOCK_LENGTH); + ret = regmap_noinc_read(errmon->regmap, err_info->data, err_data, err_length); + if (ret < 0) + return ret; + + /* clear the error */ + ret = regmap_write(errmon->regmap, err_info->count, 0x100); + if (ret) + return ret; + /* + * The output of Core/Memory/PCIe/Others UE/CE errors follows the format + * specified in section 5.8.1 CE/UE Error Data record in + * Altra SOC BMC Interface specification. + */ + return sysfs_emit(buf, "%*phN\n", MAX_READ_BLOCK_LENGTH, err_data); +} + +/* + * Output format: + * <4-byte hex value of error info><4-byte hex value of error extensive data> + * Where: + * + error info : The error information + * + error data : Extensive data (32 bits) + * Reference to section 5.10 RAS Internal Error Register Definition in + * Altra SOC BMC Interface specification + */ +static ssize_t smpro_internal_err_read(struct device *dev, struct device_attribute *da, + char *buf, int channel) +{ + struct smpro_errmon *errmon = dev_get_drvdata(dev); + struct smpro_int_error_hdr *err_info; + unsigned int err[4] = { 0 }; + unsigned int err_type; + unsigned int val; + int ret; + + /* read error status */ + ret = regmap_read(errmon->regmap, GPI_RAS_ERR, &val); + if (ret) + return ret; + + if ((channel == RAS_SMPRO_ERR && !(val & BIT(0))) || + (channel == RAS_PMPRO_ERR && !(val & BIT(1)))) + return 0; + + err_info = &list_smpro_int_error_hdr[channel]; + ret = regmap_read(errmon->regmap, err_info->type, &val); + if (ret) + return ret; + + err_type = (val & BIT(1)) ? BIT(1) : + (val & BIT(2)) ? BIT(2) : 0; + + if (!err_type) + return 0; + + ret = regmap_read(errmon->regmap, err_info->info_l, err + 1); + if (ret) + return ret; + + ret = regmap_read(errmon->regmap, err_info->info_h, err); + if (ret) + return ret; + + if (err_type & BIT(2)) { + /* Error with data type */ + ret = regmap_read(errmon->regmap, err_info->data_l, err + 3); + if (ret) + return ret; + + ret = regmap_read(errmon->regmap, err_info->data_h, err + 2); + if (ret) + return ret; + } + + /* clear the read errors */ + ret = regmap_write(errmon->regmap, err_info->type, err_type); + if (ret) + return ret; + + return sysfs_emit(buf, "%*phN\n", (int)sizeof(err), err); +} + +/* + * Output format: + * <4-byte hex value of warining info> + * Reference to section 5.10 RAS Internal Error Register Definition in + * Altra SOC BMC Interface specification + */ +static ssize_t smpro_internal_warn_read(struct device *dev, struct device_attribute *da, + char *buf, int channel) +{ + struct smpro_errmon *errmon = dev_get_drvdata(dev); + struct smpro_int_error_hdr *err_info; + unsigned int warn[2] = { 0 }; + unsigned int val; + int ret; + + /* read error status */ + ret = regmap_read(errmon->regmap, GPI_RAS_ERR, &val); + if (ret) + return ret; + + if ((channel == RAS_SMPRO_ERR && !(val & BIT(0))) || + (channel == RAS_PMPRO_ERR && !(val & BIT(1)))) + return 0; + + err_info = &list_smpro_int_error_hdr[channel]; + ret = regmap_read(errmon->regmap, err_info->type, &val); + if (ret) + return ret; + + if (!(val & BIT(0))) + return 0; + + ret = regmap_read(errmon->regmap, err_info->warn_l, warn + 1); + if (ret) + return ret; + + ret = regmap_read(errmon->regmap, err_info->warn_h, warn); + if (ret) + return ret; + + /* clear the warning */ + ret = regmap_write(errmon->regmap, err_info->type, BIT(0)); + if (ret) + return ret; + + return sysfs_emit(buf, "%*phN\n", (int)sizeof(warn), warn); +} + +#define ERROR_OVERFLOW_RO(_error, _index) \ + static ssize_t overflow_##_error##_show(struct device *dev, \ + struct device_attribute *da, \ + char *buf) \ + { \ + return smpro_overflow_data_read(dev, da, buf, _index); \ + } \ + static DEVICE_ATTR_RO(overflow_##_error) + +ERROR_OVERFLOW_RO(core_ce, CORE_CE_ERR); +ERROR_OVERFLOW_RO(core_ue, CORE_UE_ERR); +ERROR_OVERFLOW_RO(mem_ce, MEM_CE_ERR); +ERROR_OVERFLOW_RO(mem_ue, MEM_UE_ERR); +ERROR_OVERFLOW_RO(pcie_ce, PCIE_CE_ERR); +ERROR_OVERFLOW_RO(pcie_ue, PCIE_UE_ERR); +ERROR_OVERFLOW_RO(other_ce, OTHER_CE_ERR); +ERROR_OVERFLOW_RO(other_ue, OTHER_UE_ERR); + +#define ERROR_RO(_error, _index) \ + static ssize_t error_##_error##_show(struct device *dev, \ + struct device_attribute *da, \ + char *buf) \ + { \ + return smpro_error_data_read(dev, da, buf, _index); \ + } \ + static DEVICE_ATTR_RO(error_##_error) + +ERROR_RO(core_ce, CORE_CE_ERR); +ERROR_RO(core_ue, CORE_UE_ERR); +ERROR_RO(mem_ce, MEM_CE_ERR); +ERROR_RO(mem_ue, MEM_UE_ERR); +ERROR_RO(pcie_ce, PCIE_CE_ERR); +ERROR_RO(pcie_ue, PCIE_UE_ERR); +ERROR_RO(other_ce, OTHER_CE_ERR); +ERROR_RO(other_ue, OTHER_UE_ERR); + +static ssize_t error_smpro_show(struct device *dev, struct device_attribute *da, char *buf) +{ + return smpro_internal_err_read(dev, da, buf, RAS_SMPRO_ERR); +} +static DEVICE_ATTR_RO(error_smpro); + +static ssize_t error_pmpro_show(struct device *dev, struct device_attribute *da, char *buf) +{ + return smpro_internal_err_read(dev, da, buf, RAS_PMPRO_ERR); +} +static DEVICE_ATTR_RO(error_pmpro); + +static ssize_t warn_smpro_show(struct device *dev, struct device_attribute *da, char *buf) +{ + return smpro_internal_warn_read(dev, da, buf, RAS_SMPRO_ERR); +} +static DEVICE_ATTR_RO(warn_smpro); + +static ssize_t warn_pmpro_show(struct device *dev, struct device_attribute *da, char *buf) +{ + return smpro_internal_warn_read(dev, da, buf, RAS_PMPRO_ERR); +} +static DEVICE_ATTR_RO(warn_pmpro); + +#define EVENT_RO(_event, _index) \ + static ssize_t event_##_event##_show(struct device *dev, \ + struct device_attribute *da, \ + char *buf) \ + { \ + return smpro_event_data_read(dev, da, buf, _index); \ + } \ + static DEVICE_ATTR_RO(event_##_event) + +EVENT_RO(vrd_warn_fault, VRD_WARN_FAULT_EVENT); +EVENT_RO(vrd_hot, VRD_HOT_EVENT); +EVENT_RO(dimm_hot, DIMM_HOT_EVENT); + +static struct attribute *smpro_errmon_attrs[] = { + &dev_attr_overflow_core_ce.attr, + &dev_attr_overflow_core_ue.attr, + &dev_attr_overflow_mem_ce.attr, + &dev_attr_overflow_mem_ue.attr, + &dev_attr_overflow_pcie_ce.attr, + &dev_attr_overflow_pcie_ue.attr, + &dev_attr_overflow_other_ce.attr, + &dev_attr_overflow_other_ue.attr, + &dev_attr_error_core_ce.attr, + &dev_attr_error_core_ue.attr, + &dev_attr_error_mem_ce.attr, + &dev_attr_error_mem_ue.attr, + &dev_attr_error_pcie_ce.attr, + &dev_attr_error_pcie_ue.attr, + &dev_attr_error_other_ce.attr, + &dev_attr_error_other_ue.attr, + &dev_attr_error_smpro.attr, + &dev_attr_error_pmpro.attr, + &dev_attr_warn_smpro.attr, + &dev_attr_warn_pmpro.attr, + &dev_attr_event_vrd_warn_fault.attr, + &dev_attr_event_vrd_hot.attr, + &dev_attr_event_dimm_hot.attr, + NULL +}; + +ATTRIBUTE_GROUPS(smpro_errmon); + +static int smpro_errmon_probe(struct platform_device *pdev) +{ + struct smpro_errmon *errmon; + + errmon = devm_kzalloc(&pdev->dev, sizeof(struct smpro_errmon), GFP_KERNEL); + if (!errmon) + return -ENOMEM; + + platform_set_drvdata(pdev, errmon); + + errmon->regmap = dev_get_regmap(pdev->dev.parent, NULL); + if (!errmon->regmap) + return -ENODEV; + + return 0; +} + +static struct platform_driver smpro_errmon_driver = { + .probe = smpro_errmon_probe, + .driver = { + .name = "smpro-errmon", + .dev_groups = smpro_errmon_groups, + }, +}; + +module_platform_driver(smpro_errmon_driver); + +MODULE_AUTHOR("Tung Nguyen <tung.nguyen@amperecomputing.com>"); +MODULE_AUTHOR("Thinh Pham <thinh.pham@amperecomputing.com>"); +MODULE_AUTHOR("Hoang Nguyen <hnguyen@amperecomputing.com>"); +MODULE_AUTHOR("Thu Nguyen <thu@os.amperecomputing.com>"); +MODULE_AUTHOR("Quan Nguyen <quan@os.amperecomputing.com>"); +MODULE_DESCRIPTION("Ampere Altra SMpro driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/misc/smpro-misc.c b/drivers/misc/smpro-misc.c new file mode 100644 index 000000000000..6c427141e51b --- /dev/null +++ b/drivers/misc/smpro-misc.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Ampere Computing SoC's SMpro Misc Driver + * + * Copyright (c) 2022, Ampere Computing LLC + */ +#include <linux/mod_devicetable.h> +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/regmap.h> + +/* Boot Stage/Progress Registers */ +#define BOOTSTAGE 0xB0 +#define BOOTSTAGE_LO 0xB1 +#define CUR_BOOTSTAGE 0xB2 +#define BOOTSTAGE_HI 0xB3 + +/* SOC State Registers */ +#define SOC_POWER_LIMIT 0xE5 + +struct smpro_misc { + struct regmap *regmap; +}; + +static ssize_t boot_progress_show(struct device *dev, struct device_attribute *da, char *buf) +{ + struct smpro_misc *misc = dev_get_drvdata(dev); + u16 boot_progress[3] = { 0 }; + u32 bootstage; + u8 boot_stage; + u8 cur_stage; + u32 reg_lo; + u32 reg; + int ret; + + /* Read current boot stage */ + ret = regmap_read(misc->regmap, CUR_BOOTSTAGE, ®); + if (ret) + return ret; + + cur_stage = reg & 0xff; + + ret = regmap_read(misc->regmap, BOOTSTAGE, &bootstage); + if (ret) + return ret; + + boot_stage = (bootstage >> 8) & 0xff; + + if (boot_stage > cur_stage) + return -EINVAL; + + ret = regmap_read(misc->regmap, BOOTSTAGE_LO, ®_lo); + if (!ret) + ret = regmap_read(misc->regmap, BOOTSTAGE_HI, ®); + if (ret) + return ret; + + /* Firmware to report new boot stage next time */ + if (boot_stage < cur_stage) { + ret = regmap_write(misc->regmap, BOOTSTAGE, ((bootstage & 0xff00) | 0x1)); + if (ret) + return ret; + } + + boot_progress[0] = bootstage; + boot_progress[1] = swab16(reg); + boot_progress[2] = swab16(reg_lo); + + return sysfs_emit(buf, "%*phN\n", (int)sizeof(boot_progress), boot_progress); +} + +static DEVICE_ATTR_RO(boot_progress); + +static ssize_t soc_power_limit_show(struct device *dev, struct device_attribute *da, char *buf) +{ + struct smpro_misc *misc = dev_get_drvdata(dev); + unsigned int value; + int ret; + + ret = regmap_read(misc->regmap, SOC_POWER_LIMIT, &value); + if (ret) + return ret; + + return sysfs_emit(buf, "%d\n", value); +} + +static ssize_t soc_power_limit_store(struct device *dev, struct device_attribute *da, + const char *buf, size_t count) +{ + struct smpro_misc *misc = dev_get_drvdata(dev); + unsigned long val; + s32 ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + ret = regmap_write(misc->regmap, SOC_POWER_LIMIT, (unsigned int)val); + if (ret) + return -EPROTO; + + return count; +} + +static DEVICE_ATTR_RW(soc_power_limit); + +static struct attribute *smpro_misc_attrs[] = { + &dev_attr_boot_progress.attr, + &dev_attr_soc_power_limit.attr, + NULL +}; + +ATTRIBUTE_GROUPS(smpro_misc); + +static int smpro_misc_probe(struct platform_device *pdev) +{ + struct smpro_misc *misc; + + misc = devm_kzalloc(&pdev->dev, sizeof(struct smpro_misc), GFP_KERNEL); + if (!misc) + return -ENOMEM; + + platform_set_drvdata(pdev, misc); + + misc->regmap = dev_get_regmap(pdev->dev.parent, NULL); + if (!misc->regmap) + return -ENODEV; + + return 0; +} + +static struct platform_driver smpro_misc_driver = { + .probe = smpro_misc_probe, + .driver = { + .name = "smpro-misc", + .dev_groups = smpro_misc_groups, + }, +}; + +module_platform_driver(smpro_misc_driver); + +MODULE_AUTHOR("Tung Nguyen <tungnguyen@os.amperecomputing.com>"); +MODULE_AUTHOR("Quan Nguyen <quan@os.amperecomputing.com>"); +MODULE_DESCRIPTION("Ampere Altra SMpro Misc driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/misc/tifm_7xx1.c b/drivers/misc/tifm_7xx1.c index 017c2f7d6287..7dd86a9858ab 100644 --- a/drivers/misc/tifm_7xx1.c +++ b/drivers/misc/tifm_7xx1.c @@ -190,7 +190,7 @@ static void tifm_7xx1_switch_media(struct work_struct *work) spin_unlock_irqrestore(&fm->lock, flags); } if (sock) - tifm_free_device(&sock->dev); + put_device(&sock->dev); } spin_lock_irqsave(&fm->lock, flags); } diff --git a/drivers/misc/tsl2550.c b/drivers/misc/tsl2550.c index 1652fb9b3856..6c62b94e0acd 100644 --- a/drivers/misc/tsl2550.c +++ b/drivers/misc/tsl2550.c @@ -331,8 +331,7 @@ static int tsl2550_init_client(struct i2c_client *client) */ static struct i2c_driver tsl2550_driver; -static int tsl2550_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int tsl2550_probe(struct i2c_client *client) { struct i2c_adapter *adapter = client->adapter; struct tsl2550_data *data; @@ -438,7 +437,7 @@ static struct i2c_driver tsl2550_driver = { .of_match_table = tsl2550_of_match, .pm = TSL2550_PM_OPS, }, - .probe = tsl2550_probe, + .probe_new = tsl2550_probe, .remove = tsl2550_remove, .id_table = tsl2550_id, }; |