diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-06 08:59:35 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-06 08:59:35 -0700 |
commit | a9cf69d0e7f2051cca1c08ed9b34fe79da951ee9 (patch) | |
tree | 80af592fdcb2a815a35272cd8375aa03c7d2f04a /drivers/vfio | |
parent | 6614a3c3164a5df2b54abb0b3559f51041cf705b (diff) | |
parent | 099fd2c2020751737d9288f923d562e0e05977eb (diff) |
Merge tag 'vfio-v6.0-rc1' of https://github.com/awilliam/linux-vfio
Pull VFIO updates from Alex Williamson:
- Cleanup use of extern in function prototypes (Alex Williamson)
- Simplify bus_type usage and convert to device IOMMU interfaces (Robin
Murphy)
- Check missed return value and fix comment typos (Bo Liu)
- Split migration ops from device ops and fix races in mlx5 migration
support (Yishai Hadas)
- Fix missed return value check in noiommu support (Liam Ni)
- Hardening to clear buffer pointer to avoid use-after-free (Schspa
Shi)
- Remove requirement that only the same mm can unmap a previously
mapped range (Li Zhe)
- Adjust semaphore release vs device open counter (Yi Liu)
- Remove unused arg from SPAPR support code (Deming Wang)
- Rework vfio-ccw driver to better fit new mdev framework (Eric Farman,
Michael Kawano)
- Replace DMA unmap notifier with callbacks (Jason Gunthorpe)
- Clarify SPAPR support comment relative to iommu_ops (Alexey
Kardashevskiy)
- Revise page pinning API towards compatibility with future iommufd
support (Nicolin Chen)
- Resolve issues in vfio-ccw, including use of DMA unmap callback (Eric
Farman)
* tag 'vfio-v6.0-rc1' of https://github.com/awilliam/linux-vfio: (40 commits)
vfio/pci: fix the wrong word
vfio/ccw: Check return code from subchannel quiesce
vfio/ccw: Remove FSM Close from remove handlers
vfio/ccw: Add length to DMA_UNMAP checks
vfio: Replace phys_pfn with pages for vfio_pin_pages()
vfio/ccw: Add kmap_local_page() for memcpy
vfio: Rename user_iova of vfio_dma_rw()
vfio/ccw: Change pa_pfn list to pa_iova list
vfio/ap: Change saved_pfn to saved_iova
vfio: Pass in starting IOVA to vfio_pin/unpin_pages API
vfio/ccw: Only pass in contiguous pages
vfio/ap: Pass in physical address of ind to ap_aqic()
drm/i915/gvt: Replace roundup with DIV_ROUND_UP
vfio: Make vfio_unpin_pages() return void
vfio/spapr_tce: Fix the comment
vfio: Replace the iommu notifier with a device list
vfio: Replace the DMA unmapping notifier with a callback
vfio/ccw: Move FSM open/close to MDEV open/close
vfio/ccw: Refactor vfio_ccw_mdev_reset
vfio/ccw: Create a CLOSE FSM event
...
Diffstat (limited to 'drivers/vfio')
-rw-r--r-- | drivers/vfio/fsl-mc/vfio_fsl_mc_private.h | 2 | ||||
-rw-r--r-- | drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 11 | ||||
-rw-r--r-- | drivers/vfio/pci/mlx5/cmd.c | 14 | ||||
-rw-r--r-- | drivers/vfio/pci/mlx5/cmd.h | 4 | ||||
-rw-r--r-- | drivers/vfio/pci/mlx5/main.c | 11 | ||||
-rw-r--r-- | drivers/vfio/pci/vfio_pci_config.c | 4 | ||||
-rw-r--r-- | drivers/vfio/pci/vfio_pci_core.c | 7 | ||||
-rw-r--r-- | drivers/vfio/platform/vfio_platform_private.h | 21 | ||||
-rw-r--r-- | drivers/vfio/vfio.c | 192 | ||||
-rw-r--r-- | drivers/vfio/vfio.h | 17 | ||||
-rw-r--r-- | drivers/vfio/vfio_iommu_spapr_tce.c | 14 | ||||
-rw-r--r-- | drivers/vfio/vfio_iommu_type1.c | 197 |
12 files changed, 241 insertions, 253 deletions
diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h b/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h index 4ad63ececb91..7a29f572f93d 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h @@ -39,7 +39,7 @@ struct vfio_fsl_mc_device { struct vfio_fsl_mc_irq *mc_irqs; }; -extern int vfio_fsl_mc_set_irqs_ioctl(struct vfio_fsl_mc_device *vdev, +int vfio_fsl_mc_set_irqs_ioctl(struct vfio_fsl_mc_device *vdev, u32 flags, unsigned int index, unsigned int start, unsigned int count, void *data); diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 4def43f5f7b6..ea762e28c1cc 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1185,7 +1185,7 @@ static int hisi_acc_vfio_pci_open_device(struct vfio_device *core_vdev) if (ret) return ret; - if (core_vdev->ops->migration_set_state) { + if (core_vdev->mig_ops) { ret = hisi_acc_vf_qm_init(hisi_acc_vdev); if (ret) { vfio_pci_core_disable(vdev); @@ -1208,6 +1208,11 @@ static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev) vfio_pci_core_close_device(core_vdev); } +static const struct vfio_migration_ops hisi_acc_vfio_pci_migrn_state_ops = { + .migration_set_state = hisi_acc_vfio_pci_set_device_state, + .migration_get_state = hisi_acc_vfio_pci_get_device_state, +}; + static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { .name = "hisi-acc-vfio-pci-migration", .open_device = hisi_acc_vfio_pci_open_device, @@ -1219,8 +1224,6 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { .mmap = hisi_acc_vfio_pci_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, - .migration_set_state = hisi_acc_vfio_pci_set_device_state, - .migration_get_state = hisi_acc_vfio_pci_get_device_state, }; static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { @@ -1272,6 +1275,8 @@ static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device if (!ret) { vfio_pci_core_init_device(&hisi_acc_vdev->core_device, pdev, &hisi_acc_vfio_pci_migrn_ops); + hisi_acc_vdev->core_device.vdev.mig_ops = + &hisi_acc_vfio_pci_migrn_state_ops; } else { pci_warn(pdev, "migration support failed, continue with generic interface\n"); vfio_pci_core_init_device(&hisi_acc_vdev->core_device, pdev, diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 9b9f33ca270a..dd5d7bfe0a49 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -88,6 +88,16 @@ static int mlx5fv_vf_event(struct notifier_block *nb, return 0; } +void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev) +{ + if (!mvdev->migrate_cap) + return; + + mutex_lock(&mvdev->state_mutex); + mlx5vf_disable_fds(mvdev); + mlx5vf_state_mutex_unlock(mvdev); +} + void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev) { if (!mvdev->migrate_cap) @@ -98,7 +108,8 @@ void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev) destroy_workqueue(mvdev->cb_wq); } -void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev) +void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, + const struct vfio_migration_ops *mig_ops) { struct pci_dev *pdev = mvdev->core_device.pdev; int ret; @@ -139,6 +150,7 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev) mvdev->core_device.vdev.migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P; + mvdev->core_device.vdev.mig_ops = mig_ops; end: mlx5_vf_put_core_dev(mvdev->mdev); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 6c3112fdd8b1..8208f4701a90 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -62,8 +62,10 @@ int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, size_t *state_size); -void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev); +void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, + const struct vfio_migration_ops *mig_ops); void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev); +void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, struct mlx5_vf_migration_file *migf); int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 0558d0649ddb..a9b63d15c5d3 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -570,10 +570,15 @@ static void mlx5vf_pci_close_device(struct vfio_device *core_vdev) struct mlx5vf_pci_core_device *mvdev = container_of( core_vdev, struct mlx5vf_pci_core_device, core_device.vdev); - mlx5vf_disable_fds(mvdev); + mlx5vf_cmd_close_migratable(mvdev); vfio_pci_core_close_device(core_vdev); } +static const struct vfio_migration_ops mlx5vf_pci_mig_ops = { + .migration_set_state = mlx5vf_pci_set_device_state, + .migration_get_state = mlx5vf_pci_get_device_state, +}; + static const struct vfio_device_ops mlx5vf_pci_ops = { .name = "mlx5-vfio-pci", .open_device = mlx5vf_pci_open_device, @@ -585,8 +590,6 @@ static const struct vfio_device_ops mlx5vf_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, - .migration_set_state = mlx5vf_pci_set_device_state, - .migration_get_state = mlx5vf_pci_get_device_state, }; static int mlx5vf_pci_probe(struct pci_dev *pdev, @@ -599,7 +602,7 @@ static int mlx5vf_pci_probe(struct pci_dev *pdev, if (!mvdev) return -ENOMEM; vfio_pci_core_init_device(&mvdev->core_device, pdev, &mlx5vf_pci_ops); - mlx5vf_cmd_set_migratable(mvdev); + mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops); dev_set_drvdata(&pdev->dev, &mvdev->core_device); ret = vfio_pci_core_register_device(&mvdev->core_device); if (ret) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 9343f597182d..442d3ba4122b 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -222,7 +222,7 @@ static int vfio_default_config_write(struct vfio_pci_core_device *vdev, int pos, memcpy(vdev->vconfig + pos, &virt_val, count); } - /* Non-virtualzed and writable bits go to hardware */ + /* Non-virtualized and writable bits go to hardware */ if (write & ~virt) { struct pci_dev *pdev = vdev->pdev; __le32 phys_val = 0; @@ -1728,7 +1728,7 @@ int vfio_config_init(struct vfio_pci_core_device *vdev) /* * Config space, caps and ecaps are all dword aligned, so we could * use one byte per dword to record the type. However, there are - * no requiremenst on the length of a capability, so the gap between + * no requirements on the length of a capability, so the gap between * capabilities needs byte granularity. */ map = kmalloc(pdev->cfg_size, GFP_KERNEL); diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 0d8d4cdfb2f0..c8d3b0450fb3 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1868,6 +1868,13 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) return -EINVAL; + if (vdev->vdev.mig_ops) { + if (!(vdev->vdev.mig_ops->migration_get_state && + vdev->vdev.mig_ops->migration_set_state) || + !(vdev->vdev.migration_flags & VFIO_MIGRATION_STOP_COPY)) + return -EINVAL; + } + /* * Prevent binding to PFs with VFs enabled, the VFs might be in use * by the host or other users. We cannot capture the VFs if they diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h index 520d2a8e8375..691b43f4b2b2 100644 --- a/drivers/vfio/platform/vfio_platform_private.h +++ b/drivers/vfio/platform/vfio_platform_private.h @@ -78,21 +78,20 @@ struct vfio_platform_reset_node { vfio_platform_reset_fn_t of_reset; }; -extern int vfio_platform_probe_common(struct vfio_platform_device *vdev, - struct device *dev); +int vfio_platform_probe_common(struct vfio_platform_device *vdev, + struct device *dev); void vfio_platform_remove_common(struct vfio_platform_device *vdev); -extern int vfio_platform_irq_init(struct vfio_platform_device *vdev); -extern void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev); +int vfio_platform_irq_init(struct vfio_platform_device *vdev); +void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev); -extern int vfio_platform_set_irqs_ioctl(struct vfio_platform_device *vdev, - uint32_t flags, unsigned index, - unsigned start, unsigned count, - void *data); +int vfio_platform_set_irqs_ioctl(struct vfio_platform_device *vdev, + uint32_t flags, unsigned index, + unsigned start, unsigned count, void *data); -extern void __vfio_platform_register_reset(struct vfio_platform_reset_node *n); -extern void vfio_platform_unregister_reset(const char *compat, - vfio_platform_reset_fn_t fn); +void __vfio_platform_register_reset(struct vfio_platform_reset_node *n); +void vfio_platform_unregister_reset(const char *compat, + vfio_platform_reset_fn_t fn); #define vfio_platform_register_reset(__compat, __reset) \ static struct vfio_platform_reset_node __reset ## _node = { \ .owner = THIS_MODULE, \ diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 521a3eabf0e1..7cb56c382c97 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -231,6 +231,9 @@ int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops) { struct vfio_iommu_driver *driver, *tmp; + if (WARN_ON(!ops->register_device != !ops->unregister_device)) + return -EINVAL; + driver = kzalloc(sizeof(*driver), GFP_KERNEL); if (!driver) return -ENOMEM; @@ -504,7 +507,9 @@ static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev, if (IS_ERR(iommu_group)) return ERR_CAST(iommu_group); - iommu_group_set_name(iommu_group, "vfio-noiommu"); + ret = iommu_group_set_name(iommu_group, "vfio-noiommu"); + if (ret) + goto out_put_group; ret = iommu_group_add_device(iommu_group, dev); if (ret) goto out_put_group; @@ -554,7 +559,7 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) * restore cache coherency. It has to be checked here because it is only * valid for cases where we are using iommu groups. */ - if (!iommu_capable(dev->bus, IOMMU_CAP_CACHE_COHERENCY)) { + if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) { iommu_group_put(iommu_group); return ERR_PTR(-EINVAL); } @@ -1082,6 +1087,7 @@ static void vfio_device_unassign_container(struct vfio_device *device) static struct file *vfio_device_open(struct vfio_device *device) { + struct vfio_iommu_driver *iommu_driver; struct file *filep; int ret; @@ -1112,6 +1118,12 @@ static struct file *vfio_device_open(struct vfio_device *device) if (ret) goto err_undo_count; } + + iommu_driver = device->group->container->iommu_driver; + if (iommu_driver && iommu_driver->ops->register_device) + iommu_driver->ops->register_device( + device->group->container->iommu_data, device); + up_read(&device->group->group_rwsem); } mutex_unlock(&device->dev_set->lock); @@ -1146,13 +1158,19 @@ static struct file *vfio_device_open(struct vfio_device *device) err_close_device: mutex_lock(&device->dev_set->lock); down_read(&device->group->group_rwsem); - if (device->open_count == 1 && device->ops->close_device) + if (device->open_count == 1 && device->ops->close_device) { device->ops->close_device(device); + + iommu_driver = device->group->container->iommu_driver; + if (iommu_driver && iommu_driver->ops->unregister_device) + iommu_driver->ops->unregister_device( + device->group->container->iommu_data, device); + } err_undo_count: + up_read(&device->group->group_rwsem); device->open_count--; if (device->open_count == 0 && device->kvm) device->kvm = NULL; - up_read(&device->group->group_rwsem); mutex_unlock(&device->dev_set->lock); module_put(device->dev->driver->owner); err_unassign_container: @@ -1342,12 +1360,18 @@ static const struct file_operations vfio_group_fops = { static int vfio_device_fops_release(struct inode *inode, struct file *filep) { struct vfio_device *device = filep->private_data; + struct vfio_iommu_driver *iommu_driver; mutex_lock(&device->dev_set->lock); vfio_assert_device_open(device); down_read(&device->group->group_rwsem); if (device->open_count == 1 && device->ops->close_device) device->ops->close_device(device); + + iommu_driver = device->group->container->iommu_driver; + if (iommu_driver && iommu_driver->ops->unregister_device) + iommu_driver->ops->unregister_device( + device->group->container->iommu_data, device); up_read(&device->group->group_rwsem); device->open_count--; if (device->open_count == 0) @@ -1544,8 +1568,7 @@ vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device, struct file *filp = NULL; int ret; - if (!device->ops->migration_set_state || - !device->ops->migration_get_state) + if (!device->mig_ops) return -ENOTTY; ret = vfio_check_feature(flags, argsz, @@ -1561,7 +1584,8 @@ vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device, if (flags & VFIO_DEVICE_FEATURE_GET) { enum vfio_device_mig_state curr_state; - ret = device->ops->migration_get_state(device, &curr_state); + ret = device->mig_ops->migration_get_state(device, + &curr_state); if (ret) return ret; mig.device_state = curr_state; @@ -1569,7 +1593,7 @@ vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device, } /* Handle the VFIO_DEVICE_FEATURE_SET */ - filp = device->ops->migration_set_state(device, mig.device_state); + filp = device->mig_ops->migration_set_state(device, mig.device_state); if (IS_ERR(filp) || !filp) goto out_copy; @@ -1592,8 +1616,7 @@ static int vfio_ioctl_device_feature_migration(struct vfio_device *device, }; int ret; - if (!device->ops->migration_set_state || - !device->ops->migration_get_state) + if (!device->mig_ops) return -ENOTTY; ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET, @@ -1815,6 +1838,7 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps, buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL); if (!buf) { kfree(caps->buf); + caps->buf = NULL; caps->size = 0; return ERR_PTR(-ENOMEM); } @@ -1913,26 +1937,25 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare); /* - * Pin a set of guest PFNs and return their associated host PFNs for local + * Pin contiguous user pages and return their associated host pages for local * domain only. * @device [in] : device - * @user_pfn [in]: array of user/guest PFNs to be pinned. - * @npage [in] : count of elements in user_pfn array. This count should not - * be greater VFIO_PIN_PAGES_MAX_ENTRIES. + * @iova [in] : starting IOVA of user pages to be pinned. + * @npage [in] : count of pages to be pinned. This count should not + * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. * @prot [in] : protection flags - * @phys_pfn[out]: array of host PFNs + * @pages[out] : array of host pages * Return error or number of pages pinned. */ -int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn, - int npage, int prot, unsigned long *phys_pfn) +int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, + int npage, int prot, struct page **pages) { struct vfio_container *container; struct vfio_group *group = device->group; struct vfio_iommu_driver *driver; int ret; - if (!user_pfn || !phys_pfn || !npage || - !vfio_assert_device_open(device)) + if (!pages || !npage || !vfio_assert_device_open(device)) return -EINVAL; if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) @@ -1946,8 +1969,8 @@ int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn, driver = container->iommu_driver; if (likely(driver && driver->ops->pin_pages)) ret = driver->ops->pin_pages(container->iommu_data, - group->iommu_group, user_pfn, - npage, prot, phys_pfn); + group->iommu_group, iova, + npage, prot, pages); else ret = -ENOTTY; @@ -1956,37 +1979,28 @@ int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn, EXPORT_SYMBOL(vfio_pin_pages); /* - * Unpin set of host PFNs for local domain only. + * Unpin contiguous host pages for local domain only. * @device [in] : device - * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest - * PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES. - * @npage [in] : count of elements in user_pfn array. This count should not + * @iova [in] : starting address of user pages to be unpinned. + * @npage [in] : count of pages to be unpinned. This count should not * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. - * Return error or number of pages unpinned. */ -int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn, - int npage) +void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage) { struct vfio_container *container; struct vfio_iommu_driver *driver; - int ret; - if (!user_pfn || !npage || !vfio_assert_device_open(device)) - return -EINVAL; + if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES)) + return; - if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) - return -E2BIG; + if (WARN_ON(!vfio_assert_device_open(device))) + return; /* group->container cannot change while a vfio device is open */ container = device->group->container; driver = container->iommu_driver; - if (likely(driver && driver->ops->unpin_pages)) - ret = driver->ops->unpin_pages(container->iommu_data, user_pfn, - npage); - else - ret = -ENOTTY; - return ret; + driver->ops->unpin_pages(container->iommu_data, iova, npage); } EXPORT_SYMBOL(vfio_unpin_pages); @@ -2001,13 +2015,13 @@ EXPORT_SYMBOL(vfio_unpin_pages); * not a real device DMA, it is not necessary to pin the user space memory. * * @device [in] : VFIO device - * @user_iova [in] : base IOVA of a user space buffer + * @iova [in] : base IOVA of a user space buffer * @data [in] : pointer to kernel buffer * @len [in] : kernel buffer length * @write : indicate read or write * Return error code on failure or 0 on success. */ -int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data, +int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data, size_t len, bool write) { struct vfio_container *container; @@ -2023,97 +2037,13 @@ int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova, void *data, if (likely(driver && driver->ops->dma_rw)) ret = driver->ops->dma_rw(container->iommu_data, - user_iova, data, len, write); + iova, data, len, write); else ret = -ENOTTY; return ret; } EXPORT_SYMBOL(vfio_dma_rw); -static int vfio_register_iommu_notifier(struct vfio_group *group, - unsigned long *events, - struct notifier_block *nb) -{ - struct vfio_container *container; - struct vfio_iommu_driver *driver; - int ret; - - lockdep_assert_held_read(&group->group_rwsem); - - container = group->container; - driver = container->iommu_driver; - if (likely(driver && driver->ops->register_notifier)) - ret = driver->ops->register_notifier(container->iommu_data, - events, nb); - else - ret = -ENOTTY; - - return ret; -} - -static int vfio_unregister_iommu_notifier(struct vfio_group *group, - struct notifier_block *nb) -{ - struct vfio_container *container; - struct vfio_iommu_driver *driver; - int ret; - - lockdep_assert_held_read(&group->group_rwsem); - - container = group->container; - driver = container->iommu_driver; - if (likely(driver && driver->ops->unregister_notifier)) - ret = driver->ops->unregister_notifier(container->iommu_data, - nb); - else - ret = -ENOTTY; - - return ret; -} - -int vfio_register_notifier(struct vfio_device *device, - enum vfio_notify_type type, unsigned long *events, - struct notifier_block *nb) -{ - struct vfio_group *group = device->group; - int ret; - - if (!nb || !events || (*events == 0) || - !vfio_assert_device_open(device)) - return -EINVAL; - - switch (type) { - case VFIO_IOMMU_NOTIFY: - ret = vfio_register_iommu_notifier(group, events, nb); - break; - default: - ret = -EINVAL; - } - return ret; -} -EXPORT_SYMBOL(vfio_register_notifier); - -int vfio_unregister_notifier(struct vfio_device *device, - enum vfio_notify_type type, - struct notifier_block *nb) -{ - struct vfio_group *group = device->group; - int ret; - - if (!nb || !vfio_assert_device_open(device)) - return -EINVAL; - - switch (type) { - case VFIO_IOMMU_NOTIFY: - ret = vfio_unregister_iommu_notifier(group, nb); - break; - default: - ret = -EINVAL; - } - return ret; -} -EXPORT_SYMBOL(vfio_unregister_notifier); - /* * Module/class support */ @@ -2159,13 +2089,17 @@ static int __init vfio_init(void) if (ret) goto err_alloc_chrdev; - pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); - #ifdef CONFIG_VFIO_NOIOMMU - vfio_register_iommu_driver(&vfio_noiommu_ops); + ret = vfio_register_iommu_driver(&vfio_noiommu_ops); #endif + if (ret) + goto err_driver_register; + + pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); return 0; +err_driver_register: + unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); err_alloc_chrdev: class_destroy(vfio.class); vfio.class = NULL; diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index a67130221151..503bea6c843d 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -50,16 +50,15 @@ struct vfio_iommu_driver_ops { struct iommu_group *group); int (*pin_pages)(void *iommu_data, struct iommu_group *group, - unsigned long *user_pfn, + dma_addr_t user_iova, int npage, int prot, - unsigned long *phys_pfn); - int (*unpin_pages)(void *iommu_data, - unsigned long *user_pfn, int npage); - int (*register_notifier)(void *iommu_data, - unsigned long *events, - struct notifier_block *nb); - int (*unregister_notifier)(void *iommu_data, - struct notifier_block *nb); + struct page **pages); + void (*unpin_pages)(void *iommu_data, + dma_addr_t user_iova, int npage); + void (*register_device)(void *iommu_data, + struct vfio_device *vdev); + void (*unregister_device)(void *iommu_data, + struct vfio_device *vdev); int (*dma_rw)(void *iommu_data, dma_addr_t user_iova, void *data, size_t count, bool write); struct iommu_domain *(*group_iommu_domain)(void *iommu_data, diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 708a95e61831..169f07ac162d 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -378,8 +378,7 @@ static void tce_iommu_release(void *iommu_data) kfree(container); } -static void tce_iommu_unuse_page(struct tce_container *container, - unsigned long hpa) +static void tce_iommu_unuse_page(unsigned long hpa) { struct page *page; @@ -474,7 +473,7 @@ static int tce_iommu_clear(struct tce_container *container, continue; } - tce_iommu_unuse_page(container, oldhpa); + tce_iommu_unuse_page(oldhpa); } iommu_tce_kill(tbl, firstentry, pages); @@ -524,7 +523,7 @@ static long tce_iommu_build(struct tce_container *container, ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i, &hpa, &dirtmp); if (ret) { - tce_iommu_unuse_page(container, hpa); + tce_iommu_unuse_page(hpa); pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n", __func__, entry << tbl->it_page_shift, tce, ret); @@ -532,7 +531,7 @@ static long tce_iommu_build(struct tce_container *container, } if (dirtmp != DMA_NONE) - tce_iommu_unuse_page(container, hpa); + tce_iommu_unuse_page(hpa); tce += IOMMU_PAGE_SIZE(tbl); } @@ -1266,7 +1265,10 @@ static int tce_iommu_attach_group(void *iommu_data, goto unlock_exit; } - /* Check if new group has the same iommu_ops (i.e. compatible) */ + /* + * Check if new group has the same iommu_table_group_ops + * (i.e. compatible) + */ list_for_each_entry(tcegrp, &container->group_list, next) { struct iommu_table_group *table_group_tmp; diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index c13b9290e357..db516c90a977 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -67,7 +67,8 @@ struct vfio_iommu { struct list_head iova_list; struct mutex lock; struct rb_root dma_list; - struct blocking_notifier_head notifier; + struct list_head device_list; + struct mutex device_list_lock; unsigned int dma_avail; unsigned int vaddr_invalid_count; uint64_t pgsize_bitmap; @@ -828,9 +829,9 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova, static int vfio_iommu_type1_pin_pages(void *iommu_data, struct iommu_group *iommu_group, - unsigned long *user_pfn, + dma_addr_t user_iova, int npage, int prot, - unsigned long *phys_pfn) + struct page **pages) { struct vfio_iommu *iommu = iommu_data; struct vfio_iommu_group *group; @@ -840,7 +841,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, bool do_accounting; dma_addr_t iova; - if (!iommu || !user_pfn || !phys_pfn) + if (!iommu || !pages) return -EINVAL; /* Supported for v2 version only */ @@ -856,7 +857,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, again: if (iommu->vaddr_invalid_count) { for (i = 0; i < npage; i++) { - iova = user_pfn[i] << PAGE_SHIFT; + iova = user_iova + PAGE_SIZE * i; ret = vfio_find_dma_valid(iommu, iova, PAGE_SIZE, &dma); if (ret < 0) goto pin_done; @@ -865,8 +866,8 @@ again: } } - /* Fail if notifier list is empty */ - if (!iommu->notifier.head) { + /* Fail if no dma_umap notifier is registered */ + if (list_empty(&iommu->device_list)) { ret = -EINVAL; goto pin_done; } @@ -879,9 +880,10 @@ again: do_accounting = list_empty(&iommu->domain_list); for (i = 0; i < npage; i++) { + unsigned long phys_pfn; struct vfio_pfn *vpfn; - iova = user_pfn[i] << PAGE_SHIFT; + iova = user_iova + PAGE_SIZE * i; dma = vfio_find_dma(iommu, iova, PAGE_SIZE); if (!dma) { ret = -EINVAL; @@ -895,23 +897,25 @@ again: vpfn = vfio_iova_get_vfio_pfn(dma, iova); if (vpfn) { - phys_pfn[i] = vpfn->pfn; + pages[i] = pfn_to_page(vpfn->pfn); continue; } remote_vaddr = dma->vaddr + (iova - dma->iova); - ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i], + ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn, do_accounting); if (ret) goto pin_unwind; - ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]); + ret = vfio_add_to_pfn_list(dma, iova, phys_pfn); if (ret) { - if (put_pfn(phys_pfn[i], dma->prot) && do_accounting) + if (put_pfn(phys_pfn, dma->prot) && do_accounting) vfio_lock_acct(dma, -1, true); goto pin_unwind; } + pages[i] = pfn_to_page(phys_pfn); + if (iommu->dirty_page_tracking) { unsigned long pgshift = __ffs(iommu->pgsize_bitmap); @@ -934,43 +938,38 @@ again: goto pin_done; pin_unwind: - phys_pfn[i] = 0; + pages[i] = NULL; for (j = 0; j < i; j++) { dma_addr_t iova; - iova = user_pfn[j] << PAGE_SHIFT; + iova = user_iova + PAGE_SIZE * j; dma = vfio_find_dma(iommu, iova, PAGE_SIZE); vfio_unpin_page_external(dma, iova, do_accounting); - phys_pfn[j] = 0; + pages[j] = NULL; } pin_done: mutex_unlock(&iommu->lock); return ret; } -static int vfio_iommu_type1_unpin_pages(void *iommu_data, - unsigned long *user_pfn, - int npage) +static void vfio_iommu_type1_unpin_pages(void *iommu_data, + dma_addr_t user_iova, int npage) { struct vfio_iommu *iommu = iommu_data; bool do_accounting; int i; - if (!iommu || !user_pfn || npage <= 0) - return -EINVAL; - /* Supported for v2 version only */ - if (!iommu->v2) - return -EACCES; + if (WARN_ON(!iommu->v2)) + return; mutex_lock(&iommu->lock); do_accounting = list_empty(&iommu->domain_list); for (i = 0; i < npage; i++) { + dma_addr_t iova = user_iova + PAGE_SIZE * i; struct vfio_dma *dma; - dma_addr_t iova; - iova = user_pfn[i] << PAGE_SHIFT; dma = vfio_find_dma(iommu, iova, PAGE_SIZE); if (!dma) break; @@ -979,7 +978,8 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data, } mutex_unlock(&iommu->lock); - return i > 0 ? i : -EINVAL; + + WARN_ON(i != npage); } static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain, @@ -1287,6 +1287,35 @@ static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size) return 0; } +/* + * Notify VFIO drivers using vfio_register_emulated_iommu_dev() to invalidate + * and unmap iovas within the range we're about to unmap. Drivers MUST unpin + * pages in response to an invalidation. + */ +static void vfio_notify_dma_unmap(struct vfio_iommu *iommu, + struct vfio_dma *dma) +{ + struct vfio_device *device; + + if (list_empty(&iommu->device_list)) + return; + + /* + * The device is expected to call vfio_unpin_pages() for any IOVA it has + * pinned within the range. Since vfio_unpin_pages() will eventually + * call back down to this code and try to obtain the iommu->lock we must + * drop it. + */ + mutex_lock(&iommu->device_list_lock); + mutex_unlock(&iommu->lock); + + list_for_each_entry(device, &iommu->device_list, iommu_entry) + device->ops->dma_unmap(device, dma->iova, dma->size); + + mutex_unlock(&iommu->device_list_lock); + mutex_lock(&iommu->lock); +} + static int vfio_dma_do_unmap(struct vfio_iommu *iommu, struct vfio_iommu_type1_dma_unmap *unmap, struct vfio_bitmap *bitmap) @@ -1377,12 +1406,6 @@ again: if (!iommu->v2 && iova > dma->iova) break; - /* - * Task with same address space who mapped this iova range is - * allowed to unmap the iova range. - */ - if (dma->task->mm != current->mm) - break; if (invalidate_vaddr) { if (dma->vaddr_invalid) { @@ -1406,8 +1429,6 @@ again: } if (!RB_EMPTY_ROOT(&dma->pfn_list)) { - struct vfio_iommu_type1_dma_unmap nb_unmap; - if (dma_last == dma) { BUG_ON(++retries > 10); } else { @@ -1415,20 +1436,7 @@ again: retries = 0; } - nb_unmap.iova = dma->iova; - nb_unmap.size = dma->size; - - /* - * Notify anyone (mdev vendor drivers) to invalidate and - * unmap iovas within the range we're about to unmap. - * Vendor drivers MUST unpin pages in response to an - * invalidation. - */ - mutex_unlock(&iommu->lock); - blocking_notifier_call_chain(&iommu->notifier, - VFIO_IOMMU_NOTIFY_DMA_UNMAP, - &nb_unmap); - mutex_lock(&iommu->lock); + vfio_notify_dma_unmap(iommu, dma); goto again; } @@ -1679,18 +1687,6 @@ out_unlock: return ret; } -static int vfio_bus_type(struct device *dev, void *data) -{ - struct bus_type **bus = data; - - if (*bus && *bus != dev->bus) - return -EINVAL; - - *bus = dev->bus; - - return 0; -} - static int vfio_iommu_replay(struct vfio_iommu *iommu, struct vfio_domain *domain) { @@ -2153,13 +2149,26 @@ static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu, list_splice_tail(iova_copy, iova); } +/* Redundantly walks non-present capabilities to simplify caller */ +static int vfio_iommu_device_capable(struct device *dev, void *data) +{ + return device_iommu_capable(dev, (enum iommu_cap)data); +} + +static int vfio_iommu_domain_alloc(struct device *dev, void *data) +{ + struct iommu_domain **domain = data; + + *domain = iommu_domain_alloc(dev->bus); + return 1; /* Don't iterate */ +} + static int vfio_iommu_type1_attach_group(void *iommu_data, struct iommu_group *iommu_group, enum vfio_group_type type) { struct vfio_iommu *iommu = iommu_data; struct vfio_iommu_group *group; struct vfio_domain *domain, *d; - struct bus_type *bus = NULL; bool resv_msi, msi_remap; phys_addr_t resv_msi_base = 0; struct iommu_domain_geometry *geo; @@ -2192,18 +2201,19 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, goto out_unlock; } - /* Determine bus_type in order to allocate a domain */ - ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type); - if (ret) - goto out_free_group; - ret = -ENOMEM; domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) goto out_free_group; + /* + * Going via the iommu_group iterator avoids races, and trivially gives + * us a representative device for the IOMMU API call. We don't actually + * want to iterate beyond the first device (if any). + */ ret = -EIO; - domain->domain = iommu_domain_alloc(bus); + iommu_group_for_each_dev(iommu_group, &domain->domain, + vfio_iommu_domain_alloc); if (!domain->domain) goto out_free_domain; @@ -2258,7 +2268,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, list_add(&group->next, &domain->group_list); msi_remap = irq_domain_check_msi_remap() || - iommu_capable(bus, IOMMU_CAP_INTR_REMAP); + iommu_group_for_each_dev(iommu_group, (void *)IOMMU_CAP_INTR_REMAP, + vfio_iommu_device_capable); if (!allow_unsafe_interrupts && !msi_remap) { pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n", @@ -2478,7 +2489,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, if (list_empty(&iommu->emulated_iommu_groups) && list_empty(&iommu->domain_list)) { - WARN_ON(iommu->notifier.head); + WARN_ON(!list_empty(&iommu->device_list)); vfio_iommu_unmap_unpin_all(iommu); } goto detach_group_done; @@ -2510,7 +2521,8 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, if (list_empty(&domain->group_list)) { if (list_is_singular(&iommu->domain_list)) { if (list_empty(&iommu->emulated_iommu_groups)) { - WARN_ON(iommu->notifier.head); + WARN_ON(!list_empty( + &iommu->device_list)); vfio_iommu_unmap_unpin_all(iommu); } else { vfio_iommu_unmap_unpin_reaccount(iommu); @@ -2571,7 +2583,8 @@ static void *vfio_iommu_type1_open(unsigned long arg) iommu->dma_avail = dma_entry_limit; iommu->container_open = true; mutex_init(&iommu->lock); - BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier); + mutex_init(&iommu->device_list_lock); + INIT_LIST_HEAD(&iommu->device_list); init_waitqueue_head(&iommu->vaddr_wait); iommu->pgsize_bitmap = PAGE_MASK; INIT_LIST_HEAD(&iommu->emulated_iommu_groups); @@ -3008,28 +3021,40 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, } } -static int vfio_iommu_type1_register_notifier(void *iommu_data, - unsigned long *events, - struct notifier_block *nb) +static void vfio_iommu_type1_register_device(void *iommu_data, + struct vfio_device *vdev) { struct vfio_iommu *iommu = iommu_data; - /* clear known events */ - *events &= ~VFIO_IOMMU_NOTIFY_DMA_UNMAP; - - /* refuse to register if still events remaining */ - if (*events) - return -EINVAL; + if (!vdev->ops->dma_unmap) + return; - return blocking_notifier_chain_register(&iommu->notifier, nb); + /* + * list_empty(&iommu->device_list) is tested under the iommu->lock while + * iteration for dma_unmap must be done under the device_list_lock. + * Holding both locks here allows avoiding the device_list_lock in + * several fast paths. See vfio_notify_dma_unmap() + */ + mutex_lock(&iommu->lock); + mutex_lock(&iommu->device_list_lock); + list_add(&vdev->iommu_entry, &iommu->device_list); + mutex_unlock(&iommu->device_list_lock); + mutex_unlock(&iommu->lock); } -static int vfio_iommu_type1_unregister_notifier(void *iommu_data, - struct notifier_block *nb) +static void vfio_iommu_type1_unregister_device(void *iommu_data, + struct vfio_device *vdev) { struct vfio_iommu *iommu = iommu_data; - return blocking_notifier_chain_unregister(&iommu->notifier, nb); + if (!vdev->ops->dma_unmap) + return; + + mutex_lock(&iommu->lock); + mutex_lock(&iommu->device_list_lock); + list_del(&vdev->iommu_entry); + mutex_unlock(&iommu->device_list_lock); + mutex_unlock(&iommu->lock); } static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu, @@ -3163,8 +3188,8 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { .detach_group = vfio_iommu_type1_detach_group, .pin_pages = vfio_iommu_type1_pin_pages, .unpin_pages = vfio_iommu_type1_unpin_pages, - .register_notifier = vfio_iommu_type1_register_notifier, - .unregister_notifier = vfio_iommu_type1_unregister_notifier, + .register_device = vfio_iommu_type1_register_device, + .unregister_device = vfio_iommu_type1_unregister_device, .dma_rw = vfio_iommu_type1_dma_rw, .group_iommu_domain = vfio_iommu_type1_group_iommu_domain, .notify = vfio_iommu_type1_notify, |