Diffstat (limited to 'drivers/vfio/pci')
-rw-r--r--  drivers/vfio/pci/Kconfig                        |    2
-rw-r--r--  drivers/vfio/pci/Makefile                       |    2
-rw-r--r--  drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c  |    2
-rw-r--r--  drivers/vfio/pci/mlx5/cmd.c                     |   48
-rw-r--r--  drivers/vfio/pci/mlx5/main.c                    |    1
-rw-r--r--  drivers/vfio/pci/pds/Kconfig                    |   19
-rw-r--r--  drivers/vfio/pci/pds/Makefile                   |   11
-rw-r--r--  drivers/vfio/pci/pds/cmds.c                     |  510
-rw-r--r--  drivers/vfio/pci/pds/cmds.h                     |   25
-rw-r--r--  drivers/vfio/pci/pds/dirty.c                    |  564
-rw-r--r--  drivers/vfio/pci/pds/dirty.h                    |   39
-rw-r--r--  drivers/vfio/pci/pds/lm.c                       |  434
-rw-r--r--  drivers/vfio/pci/pds/lm.h                       |   41
-rw-r--r--  drivers/vfio/pci/pds/pci_drv.c                  |  209
-rw-r--r--  drivers/vfio/pci/pds/pci_drv.h                  |    9
-rw-r--r--  drivers/vfio/pci/pds/vfio_dev.c                 |  227
-rw-r--r--  drivers/vfio/pci/pds/vfio_dev.h                 |   39
-rw-r--r--  drivers/vfio/pci/vfio_pci.c                     |    1
-rw-r--r--  drivers/vfio/pci/vfio_pci_core.c                |  261
19 files changed, 2291 insertions, 153 deletions
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 86bb7835cf3c..8125e5f37832 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -63,4 +63,6 @@ source "drivers/vfio/pci/mlx5/Kconfig"
 
 source "drivers/vfio/pci/hisilicon/Kconfig"
 
+source "drivers/vfio/pci/pds/Kconfig"
+
 endmenu
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index 24c524224da5..45167be462d8 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -11,3 +11,5 @@ obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
 
 obj-$(CONFIG_MLX5_VFIO_PCI) += mlx5/
 obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/
+
+obj-$(CONFIG_PDS_VFIO_PCI) += pds/
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index a117eaf21c14..b2f9778c8366 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -1373,6 +1373,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = {
 	.bind_iommufd = vfio_iommufd_physical_bind,
 	.unbind_iommufd = vfio_iommufd_physical_unbind,
 	.attach_ioas = vfio_iommufd_physical_attach_ioas,
+	.detach_ioas = vfio_iommufd_physical_detach_ioas,
 };
 
 static const struct vfio_device_ops hisi_acc_vfio_pci_ops = {
@@ -1391,6 +1392,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = {
 	.bind_iommufd = vfio_iommufd_physical_bind,
 	.unbind_iommufd = vfio_iommufd_physical_unbind,
 	.attach_ioas = vfio_iommufd_physical_attach_ioas,
+	.detach_ioas = vfio_iommufd_physical_detach_ioas,
 };
 
 static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index c82c1f4fc588..33574b04477d 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -732,52 +732,6 @@ void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
 	mlx5vf_cmd_dealloc_pd(migf);
 }
 
-static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes,
-			   u32 req_nodes)
-{
-	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
-	unsigned long min_gap;
-	unsigned long curr_gap;
-
-	/* Special shortcut when a single range is required */
-	if (req_nodes == 1) {
-		unsigned long last;
-
-		curr = comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
-		while (curr) {
-			last = curr->last;
-			prev = curr;
-			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
-			if (prev != comb_start)
-				interval_tree_remove(prev, root);
-		}
-		comb_start->last = last;
-		return;
-	}
-
-	/* Combine ranges which have the smallest gap */
-	while (cur_nodes > req_nodes) {
-		prev = NULL;
-		min_gap = ULONG_MAX;
-		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
-		while (curr) {
-			if (prev) {
-				curr_gap = curr->start - prev->last;
-				if (curr_gap < min_gap) {
-					min_gap = curr_gap;
-					comb_start = prev;
-					comb_end = curr;
-				}
-			}
-			prev = curr;
-			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
-		}
-		comb_start->last = comb_end->last;
-		interval_tree_remove(comb_end, root);
-		cur_nodes--;
-	}
-}
-
 static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
 				 struct mlx5vf_pci_core_device *mvdev,
 				 struct rb_root_cached *ranges, u32 nnodes)
@@ -800,7 +754,7 @@ static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
 	int i;
 
 	if (num_ranges > max_num_range) {
-		combine_ranges(ranges, nnodes, max_num_range);
+		vfio_combine_iova_ranges(ranges, nnodes, max_num_range);
 		num_ranges = max_num_range;
 	}
 
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index
d95fd382814c..42ec574a8622 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -1320,6 +1320,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = { .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, }; static int mlx5vf_pci_probe(struct pci_dev *pdev, diff --git a/drivers/vfio/pci/pds/Kconfig b/drivers/vfio/pci/pds/Kconfig new file mode 100644 index 000000000000..407b3fd32733 --- /dev/null +++ b/drivers/vfio/pci/pds/Kconfig @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2023 Advanced Micro Devices, Inc. + +config PDS_VFIO_PCI + tristate "VFIO support for PDS PCI devices" + depends on PDS_CORE + select VFIO_PCI_CORE + help + This provides generic PCI support for PDS devices using the VFIO + framework. + + More specific information on this driver can be + found in + <file:Documentation/networking/device_drivers/ethernet/amd/pds_vfio_pci.rst>. + + To compile this driver as a module, choose M here. The module + will be called pds-vfio-pci. + + If you don't know what to do here, say N. diff --git a/drivers/vfio/pci/pds/Makefile b/drivers/vfio/pci/pds/Makefile new file mode 100644 index 000000000000..d5a06d81634f --- /dev/null +++ b/drivers/vfio/pci/pds/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2023 Advanced Micro Devices, Inc. + +obj-$(CONFIG_PDS_VFIO_PCI) += pds-vfio-pci.o + +pds-vfio-pci-y := \ + cmds.o \ + dirty.o \ + lm.o \ + pci_drv.o \ + vfio_dev.o diff --git a/drivers/vfio/pci/pds/cmds.c b/drivers/vfio/pci/pds/cmds.c new file mode 100644 index 000000000000..36463ccc3df9 --- /dev/null +++ b/drivers/vfio/pci/pds/cmds.c @@ -0,0 +1,510 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2023 Advanced Micro Devices, Inc. 
*/ + +#include <linux/io.h> +#include <linux/types.h> +#include <linux/delay.h> + +#include <linux/pds/pds_common.h> +#include <linux/pds/pds_core_if.h> +#include <linux/pds/pds_adminq.h> + +#include "vfio_dev.h" +#include "cmds.h" + +#define SUSPEND_TIMEOUT_S 5 +#define SUSPEND_CHECK_INTERVAL_MS 1 + +static int pds_vfio_client_adminq_cmd(struct pds_vfio_pci_device *pds_vfio, + union pds_core_adminq_cmd *req, + union pds_core_adminq_comp *resp, + bool fast_poll) +{ + struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio); + union pds_core_adminq_cmd cmd = {}; + struct pdsc *pdsc; + int err; + + /* Wrap the client request */ + cmd.client_request.opcode = PDS_AQ_CMD_CLIENT_CMD; + cmd.client_request.client_id = cpu_to_le16(pds_vfio->client_id); + memcpy(cmd.client_request.client_cmd, req, + sizeof(cmd.client_request.client_cmd)); + + pdsc = pdsc_get_pf_struct(pdev); + if (IS_ERR(pdsc)) + return PTR_ERR(pdsc); + + err = pdsc_adminq_post(pdsc, &cmd, resp, fast_poll); + if (err && err != -EAGAIN) + dev_err(pds_vfio_to_dev(pds_vfio), + "client admin cmd failed: %pe\n", ERR_PTR(err)); + + return err; +} + +int pds_vfio_register_client_cmd(struct pds_vfio_pci_device *pds_vfio) +{ + struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio); + char devname[PDS_DEVNAME_LEN]; + struct pdsc *pdsc; + int ci; + + snprintf(devname, sizeof(devname), "%s.%d-%u", PDS_VFIO_LM_DEV_NAME, + pci_domain_nr(pdev->bus), + PCI_DEVID(pdev->bus->number, pdev->devfn)); + + pdsc = pdsc_get_pf_struct(pdev); + if (IS_ERR(pdsc)) + return PTR_ERR(pdsc); + + ci = pds_client_register(pdsc, devname); + if (ci < 0) + return ci; + + pds_vfio->client_id = ci; + + return 0; +} + +void pds_vfio_unregister_client_cmd(struct pds_vfio_pci_device *pds_vfio) +{ + struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio); + struct pdsc *pdsc; + int err; + + pdsc = pdsc_get_pf_struct(pdev); + if (IS_ERR(pdsc)) + return; + + err = pds_client_unregister(pdsc, pds_vfio->client_id); + if (err) + dev_err(&pdev->dev, "unregister from DSC failed: %pe\n", + ERR_PTR(err)); + + pds_vfio->client_id = 0; +} + +static int +pds_vfio_suspend_wait_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type) +{ + union pds_core_adminq_cmd cmd = { + .lm_suspend_status = { + .opcode = PDS_LM_CMD_SUSPEND_STATUS, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + .type = type, + }, + }; + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_adminq_comp comp = {}; + unsigned long time_limit; + unsigned long time_start; + unsigned long time_done; + int err; + + time_start = jiffies; + time_limit = time_start + HZ * SUSPEND_TIMEOUT_S; + do { + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, true); + if (err != -EAGAIN) + break; + + msleep(SUSPEND_CHECK_INTERVAL_MS); + } while (time_before(jiffies, time_limit)); + + time_done = jiffies; + dev_dbg(dev, "%s: vf%u: Suspend comp received in %d msecs\n", __func__, + pds_vfio->vf_id, jiffies_to_msecs(time_done - time_start)); + + /* Check the results */ + if (time_after_eq(time_done, time_limit)) { + dev_err(dev, "%s: vf%u: Suspend comp timeout\n", __func__, + pds_vfio->vf_id); + err = -ETIMEDOUT; + } + + return err; +} + +int pds_vfio_suspend_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type) +{ + union pds_core_adminq_cmd cmd = { + .lm_suspend = { + .opcode = PDS_LM_CMD_SUSPEND, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + .type = type, + }, + }; + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_adminq_comp comp = {}; + int err; + + dev_dbg(dev, "vf%u: Suspend device\n", pds_vfio->vf_id); + + /* + * 
The initial suspend request to the firmware starts the device suspend + * operation and the firmware returns success if it's started + * successfully. + */ + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, true); + if (err) { + dev_err(dev, "vf%u: Suspend failed: %pe\n", pds_vfio->vf_id, + ERR_PTR(err)); + return err; + } + + /* + * The subsequent suspend status request(s) check if the firmware has + * completed the device suspend process. + */ + return pds_vfio_suspend_wait_device_cmd(pds_vfio, type); +} + +int pds_vfio_resume_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type) +{ + union pds_core_adminq_cmd cmd = { + .lm_resume = { + .opcode = PDS_LM_CMD_RESUME, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + .type = type, + }, + }; + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_adminq_comp comp = {}; + + dev_dbg(dev, "vf%u: Resume device\n", pds_vfio->vf_id); + + return pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, true); +} + +int pds_vfio_get_lm_state_size_cmd(struct pds_vfio_pci_device *pds_vfio, u64 *size) +{ + union pds_core_adminq_cmd cmd = { + .lm_state_size = { + .opcode = PDS_LM_CMD_STATE_SIZE, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + }, + }; + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_adminq_comp comp = {}; + int err; + + dev_dbg(dev, "vf%u: Get migration status\n", pds_vfio->vf_id); + + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); + if (err) + return err; + + *size = le64_to_cpu(comp.lm_state_size.size); + return 0; +} + +static int pds_vfio_dma_map_lm_file(struct device *dev, + enum dma_data_direction dir, + struct pds_vfio_lm_file *lm_file) +{ + struct pds_lm_sg_elem *sgl, *sge; + struct scatterlist *sg; + dma_addr_t sgl_addr; + size_t sgl_size; + int err; + int i; + + if (!lm_file) + return -EINVAL; + + /* dma map file pages */ + err = dma_map_sgtable(dev, &lm_file->sg_table, dir, 0); + if (err) + return err; + + lm_file->num_sge = lm_file->sg_table.nents; + + /* alloc sgl */ + sgl_size = lm_file->num_sge * sizeof(struct pds_lm_sg_elem); + sgl = kzalloc(sgl_size, GFP_KERNEL); + if (!sgl) { + err = -ENOMEM; + goto out_unmap_sgtable; + } + + /* fill sgl */ + sge = sgl; + for_each_sgtable_dma_sg(&lm_file->sg_table, sg, i) { + sge->addr = cpu_to_le64(sg_dma_address(sg)); + sge->len = cpu_to_le32(sg_dma_len(sg)); + dev_dbg(dev, "addr = %llx, len = %u\n", sge->addr, sge->len); + sge++; + } + + sgl_addr = dma_map_single(dev, sgl, sgl_size, DMA_TO_DEVICE); + if (dma_mapping_error(dev, sgl_addr)) { + err = -EIO; + goto out_free_sgl; + } + + lm_file->sgl = sgl; + lm_file->sgl_addr = sgl_addr; + + return 0; + +out_free_sgl: + kfree(sgl); +out_unmap_sgtable: + lm_file->num_sge = 0; + dma_unmap_sgtable(dev, &lm_file->sg_table, dir, 0); + return err; +} + +static void pds_vfio_dma_unmap_lm_file(struct device *dev, + enum dma_data_direction dir, + struct pds_vfio_lm_file *lm_file) +{ + if (!lm_file) + return; + + /* free sgl */ + if (lm_file->sgl) { + dma_unmap_single(dev, lm_file->sgl_addr, + lm_file->num_sge * sizeof(*lm_file->sgl), + DMA_TO_DEVICE); + kfree(lm_file->sgl); + lm_file->sgl = NULL; + lm_file->sgl_addr = DMA_MAPPING_ERROR; + lm_file->num_sge = 0; + } + + /* dma unmap file pages */ + dma_unmap_sgtable(dev, &lm_file->sg_table, dir, 0); +} + +int pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio) +{ + union pds_core_adminq_cmd cmd = { + .lm_save = { + .opcode = PDS_LM_CMD_SAVE, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + }, + }; + struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio); + 
struct device *pdsc_dev = &pci_physfn(pdev)->dev; + union pds_core_adminq_comp comp = {}; + struct pds_vfio_lm_file *lm_file; + int err; + + dev_dbg(&pdev->dev, "vf%u: Get migration state\n", pds_vfio->vf_id); + + lm_file = pds_vfio->save_file; + + err = pds_vfio_dma_map_lm_file(pdsc_dev, DMA_FROM_DEVICE, lm_file); + if (err) { + dev_err(&pdev->dev, "failed to map save migration file: %pe\n", + ERR_PTR(err)); + return err; + } + + cmd.lm_save.sgl_addr = cpu_to_le64(lm_file->sgl_addr); + cmd.lm_save.num_sge = cpu_to_le32(lm_file->num_sge); + + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); + if (err) + dev_err(&pdev->dev, "failed to get migration state: %pe\n", + ERR_PTR(err)); + + pds_vfio_dma_unmap_lm_file(pdsc_dev, DMA_FROM_DEVICE, lm_file); + + return err; +} + +int pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio) +{ + union pds_core_adminq_cmd cmd = { + .lm_restore = { + .opcode = PDS_LM_CMD_RESTORE, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + }, + }; + struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio); + struct device *pdsc_dev = &pci_physfn(pdev)->dev; + union pds_core_adminq_comp comp = {}; + struct pds_vfio_lm_file *lm_file; + int err; + + dev_dbg(&pdev->dev, "vf%u: Set migration state\n", pds_vfio->vf_id); + + lm_file = pds_vfio->restore_file; + + err = pds_vfio_dma_map_lm_file(pdsc_dev, DMA_TO_DEVICE, lm_file); + if (err) { + dev_err(&pdev->dev, + "failed to map restore migration file: %pe\n", + ERR_PTR(err)); + return err; + } + + cmd.lm_restore.sgl_addr = cpu_to_le64(lm_file->sgl_addr); + cmd.lm_restore.num_sge = cpu_to_le32(lm_file->num_sge); + + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); + if (err) + dev_err(&pdev->dev, "failed to set migration state: %pe\n", + ERR_PTR(err)); + + pds_vfio_dma_unmap_lm_file(pdsc_dev, DMA_TO_DEVICE, lm_file); + + return err; +} + +void pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio, + enum pds_lm_host_vf_status vf_status) +{ + union pds_core_adminq_cmd cmd = { + .lm_host_vf_status = { + .opcode = PDS_LM_CMD_HOST_VF_STATUS, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + .status = vf_status, + }, + }; + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_adminq_comp comp = {}; + int err; + + dev_dbg(dev, "vf%u: Set host VF LM status: %u", pds_vfio->vf_id, + vf_status); + if (vf_status != PDS_LM_STA_IN_PROGRESS && + vf_status != PDS_LM_STA_NONE) { + dev_warn(dev, "Invalid host VF migration status, %d\n", + vf_status); + return; + } + + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); + if (err) + dev_warn(dev, "failed to send host VF migration status: %pe\n", + ERR_PTR(err)); +} + +int pds_vfio_dirty_status_cmd(struct pds_vfio_pci_device *pds_vfio, + u64 regions_dma, u8 *max_regions, u8 *num_regions) +{ + union pds_core_adminq_cmd cmd = { + .lm_dirty_status = { + .opcode = PDS_LM_CMD_DIRTY_STATUS, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + }, + }; + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_adminq_comp comp = {}; + int err; + + dev_dbg(dev, "vf%u: Dirty status\n", pds_vfio->vf_id); + + cmd.lm_dirty_status.regions_dma = cpu_to_le64(regions_dma); + cmd.lm_dirty_status.max_regions = *max_regions; + + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); + if (err) { + dev_err(dev, "failed to get dirty status: %pe\n", ERR_PTR(err)); + return err; + } + + /* only support seq_ack approach for now */ + if (!(le32_to_cpu(comp.lm_dirty_status.bmp_type_mask) & + BIT(PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK))) { + dev_err(dev, "Dirty 
bitmap tracking SEQ_ACK not supported\n"); + return -EOPNOTSUPP; + } + + *num_regions = comp.lm_dirty_status.num_regions; + *max_regions = comp.lm_dirty_status.max_regions; + + dev_dbg(dev, + "Page Tracking Status command successful, max_regions: %d, num_regions: %d, bmp_type: %s\n", + *max_regions, *num_regions, "PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK"); + + return 0; +} + +int pds_vfio_dirty_enable_cmd(struct pds_vfio_pci_device *pds_vfio, + u64 regions_dma, u8 num_regions) +{ + union pds_core_adminq_cmd cmd = { + .lm_dirty_enable = { + .opcode = PDS_LM_CMD_DIRTY_ENABLE, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + .regions_dma = cpu_to_le64(regions_dma), + .bmp_type = PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK, + .num_regions = num_regions, + }, + }; + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_adminq_comp comp = {}; + int err; + + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); + if (err) { + dev_err(dev, "failed dirty tracking enable: %pe\n", + ERR_PTR(err)); + return err; + } + + return 0; +} + +int pds_vfio_dirty_disable_cmd(struct pds_vfio_pci_device *pds_vfio) +{ + union pds_core_adminq_cmd cmd = { + .lm_dirty_disable = { + .opcode = PDS_LM_CMD_DIRTY_DISABLE, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + }, + }; + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_adminq_comp comp = {}; + int err; + + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); + if (err || comp.lm_dirty_status.num_regions != 0) { + /* in case num_regions is still non-zero after disable */ + err = err ? err : -EIO; + dev_err(dev, + "failed dirty tracking disable: %pe, num_regions %d\n", + ERR_PTR(err), comp.lm_dirty_status.num_regions); + return err; + } + + return 0; +} + +int pds_vfio_dirty_seq_ack_cmd(struct pds_vfio_pci_device *pds_vfio, + u64 sgl_dma, u16 num_sge, u32 offset, + u32 total_len, bool read_seq) +{ + const char *cmd_type_str = read_seq ? "read_seq" : "write_ack"; + union pds_core_adminq_cmd cmd = { + .lm_dirty_seq_ack = { + .vf_id = cpu_to_le16(pds_vfio->vf_id), + .len_bytes = cpu_to_le32(total_len), + .off_bytes = cpu_to_le32(offset), + .sgl_addr = cpu_to_le64(sgl_dma), + .num_sge = cpu_to_le16(num_sge), + }, + }; + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_adminq_comp comp = {}; + int err; + + if (read_seq) + cmd.lm_dirty_seq_ack.opcode = PDS_LM_CMD_DIRTY_READ_SEQ; + else + cmd.lm_dirty_seq_ack.opcode = PDS_LM_CMD_DIRTY_WRITE_ACK; + + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); + if (err) { + dev_err(dev, "failed cmd Page Tracking %s: %pe\n", cmd_type_str, + ERR_PTR(err)); + return err; + } + + return 0; +} diff --git a/drivers/vfio/pci/pds/cmds.h b/drivers/vfio/pci/pds/cmds.h new file mode 100644 index 000000000000..95221100b954 --- /dev/null +++ b/drivers/vfio/pci/pds/cmds.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc. 
*/ + +#ifndef _CMDS_H_ +#define _CMDS_H_ + +int pds_vfio_register_client_cmd(struct pds_vfio_pci_device *pds_vfio); +void pds_vfio_unregister_client_cmd(struct pds_vfio_pci_device *pds_vfio); +int pds_vfio_suspend_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type); +int pds_vfio_resume_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type); +int pds_vfio_get_lm_state_size_cmd(struct pds_vfio_pci_device *pds_vfio, u64 *size); +int pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio); +int pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio); +void pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio, + enum pds_lm_host_vf_status vf_status); +int pds_vfio_dirty_status_cmd(struct pds_vfio_pci_device *pds_vfio, + u64 regions_dma, u8 *max_regions, + u8 *num_regions); +int pds_vfio_dirty_enable_cmd(struct pds_vfio_pci_device *pds_vfio, + u64 regions_dma, u8 num_regions); +int pds_vfio_dirty_disable_cmd(struct pds_vfio_pci_device *pds_vfio); +int pds_vfio_dirty_seq_ack_cmd(struct pds_vfio_pci_device *pds_vfio, + u64 sgl_dma, u16 num_sge, u32 offset, + u32 total_len, bool read_seq); +#endif /* _CMDS_H_ */ diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c new file mode 100644 index 000000000000..c937aa6f3954 --- /dev/null +++ b/drivers/vfio/pci/pds/dirty.c @@ -0,0 +1,564 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2023 Advanced Micro Devices, Inc. */ + +#include <linux/interval_tree.h> +#include <linux/vfio.h> + +#include <linux/pds/pds_common.h> +#include <linux/pds/pds_core_if.h> +#include <linux/pds/pds_adminq.h> + +#include "vfio_dev.h" +#include "cmds.h" +#include "dirty.h" + +#define READ_SEQ true +#define WRITE_ACK false + +bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio) +{ + return pds_vfio->dirty.is_enabled; +} + +void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio) +{ + pds_vfio->dirty.is_enabled = true; +} + +void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio) +{ + pds_vfio->dirty.is_enabled = false; +} + +static void +pds_vfio_print_guest_region_info(struct pds_vfio_pci_device *pds_vfio, + u8 max_regions) +{ + int len = max_regions * sizeof(struct pds_lm_dirty_region_info); + struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; + struct device *pdsc_dev = &pci_physfn(pdev)->dev; + struct pds_lm_dirty_region_info *region_info; + dma_addr_t regions_dma; + u8 num_regions; + int err; + + region_info = kcalloc(max_regions, + sizeof(struct pds_lm_dirty_region_info), + GFP_KERNEL); + if (!region_info) + return; + + regions_dma = + dma_map_single(pdsc_dev, region_info, len, DMA_FROM_DEVICE); + if (dma_mapping_error(pdsc_dev, regions_dma)) + goto out_free_region_info; + + err = pds_vfio_dirty_status_cmd(pds_vfio, regions_dma, &max_regions, + &num_regions); + dma_unmap_single(pdsc_dev, regions_dma, len, DMA_FROM_DEVICE); + if (err) + goto out_free_region_info; + + for (unsigned int i = 0; i < num_regions; i++) + dev_dbg(&pdev->dev, + "region_info[%d]: dma_base 0x%llx page_count %u page_size_log2 %u\n", + i, le64_to_cpu(region_info[i].dma_base), + le32_to_cpu(region_info[i].page_count), + region_info[i].page_size_log2); + +out_free_region_info: + kfree(region_info); +} + +static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_dirty *dirty, + unsigned long bytes) +{ + unsigned long *host_seq_bmp, *host_ack_bmp; + + host_seq_bmp = vzalloc(bytes); + if (!host_seq_bmp) + return -ENOMEM; + + host_ack_bmp = vzalloc(bytes); + if (!host_ack_bmp) { + 
bitmap_free(host_seq_bmp); + return -ENOMEM; + } + + dirty->host_seq.bmp = host_seq_bmp; + dirty->host_ack.bmp = host_ack_bmp; + + return 0; +} + +static void pds_vfio_dirty_free_bitmaps(struct pds_vfio_dirty *dirty) +{ + vfree(dirty->host_seq.bmp); + vfree(dirty->host_ack.bmp); + dirty->host_seq.bmp = NULL; + dirty->host_ack.bmp = NULL; +} + +static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio, + struct pds_vfio_bmp_info *bmp_info) +{ + struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; + struct device *pdsc_dev = &pci_physfn(pdev)->dev; + + dma_unmap_single(pdsc_dev, bmp_info->sgl_addr, + bmp_info->num_sge * sizeof(struct pds_lm_sg_elem), + DMA_BIDIRECTIONAL); + kfree(bmp_info->sgl); + + bmp_info->num_sge = 0; + bmp_info->sgl = NULL; + bmp_info->sgl_addr = 0; +} + +static void pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio) +{ + if (pds_vfio->dirty.host_seq.sgl) + __pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_seq); + if (pds_vfio->dirty.host_ack.sgl) + __pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_ack); +} + +static int __pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio, + struct pds_vfio_bmp_info *bmp_info, + u32 page_count) +{ + struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; + struct device *pdsc_dev = &pci_physfn(pdev)->dev; + struct pds_lm_sg_elem *sgl; + dma_addr_t sgl_addr; + size_t sgl_size; + u32 max_sge; + + max_sge = DIV_ROUND_UP(page_count, PAGE_SIZE * 8); + sgl_size = max_sge * sizeof(struct pds_lm_sg_elem); + + sgl = kzalloc(sgl_size, GFP_KERNEL); + if (!sgl) + return -ENOMEM; + + sgl_addr = dma_map_single(pdsc_dev, sgl, sgl_size, DMA_BIDIRECTIONAL); + if (dma_mapping_error(pdsc_dev, sgl_addr)) { + kfree(sgl); + return -EIO; + } + + bmp_info->sgl = sgl; + bmp_info->num_sge = max_sge; + bmp_info->sgl_addr = sgl_addr; + + return 0; +} + +static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio, + u32 page_count) +{ + struct pds_vfio_dirty *dirty = &pds_vfio->dirty; + int err; + + err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_seq, + page_count); + if (err) + return err; + + err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_ack, + page_count); + if (err) { + __pds_vfio_dirty_free_sgl(pds_vfio, &dirty->host_seq); + return err; + } + + return 0; +} + +static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio, + struct rb_root_cached *ranges, u32 nnodes, + u64 *page_size) +{ + struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; + struct device *pdsc_dev = &pci_physfn(pdev)->dev; + struct pds_vfio_dirty *dirty = &pds_vfio->dirty; + u64 region_start, region_size, region_page_size; + struct pds_lm_dirty_region_info *region_info; + struct interval_tree_node *node = NULL; + u8 max_regions = 0, num_regions; + dma_addr_t regions_dma = 0; + u32 num_ranges = nnodes; + u32 page_count; + u16 len; + int err; + + dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n", + pds_vfio->vf_id); + + if (pds_vfio_dirty_is_enabled(pds_vfio)) + return -EINVAL; + + /* find if dirty tracking is disabled, i.e. 
num_regions == 0 */ + err = pds_vfio_dirty_status_cmd(pds_vfio, 0, &max_regions, + &num_regions); + if (err < 0) { + dev_err(&pdev->dev, "Failed to get dirty status, err %pe\n", + ERR_PTR(err)); + return err; + } else if (num_regions) { + dev_err(&pdev->dev, + "Dirty tracking already enabled for %d regions\n", + num_regions); + return -EEXIST; + } else if (!max_regions) { + dev_err(&pdev->dev, + "Device doesn't support dirty tracking, max_regions %d\n", + max_regions); + return -EOPNOTSUPP; + } + + /* + * Only support 1 region for now. If there are any large gaps in the + * VM's address regions, then this would be a waste of memory as we are + * generating 2 bitmaps (ack/seq) from the min address to the max + * address of the VM's address regions. In the future, if we support + * more than one region in the device/driver we can split the bitmaps + * on the largest address region gaps. We can do this split up to the + * max_regions times returned from the dirty_status command. + */ + max_regions = 1; + if (num_ranges > max_regions) { + vfio_combine_iova_ranges(ranges, nnodes, max_regions); + num_ranges = max_regions; + } + + node = interval_tree_iter_first(ranges, 0, ULONG_MAX); + if (!node) + return -EINVAL; + + region_size = node->last - node->start + 1; + region_start = node->start; + region_page_size = *page_size; + + len = sizeof(*region_info); + region_info = kzalloc(len, GFP_KERNEL); + if (!region_info) + return -ENOMEM; + + page_count = DIV_ROUND_UP(region_size, region_page_size); + + region_info->dma_base = cpu_to_le64(region_start); + region_info->page_count = cpu_to_le32(page_count); + region_info->page_size_log2 = ilog2(region_page_size); + + regions_dma = dma_map_single(pdsc_dev, (void *)region_info, len, + DMA_BIDIRECTIONAL); + if (dma_mapping_error(pdsc_dev, regions_dma)) { + err = -ENOMEM; + goto out_free_region_info; + } + + err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, max_regions); + dma_unmap_single(pdsc_dev, regions_dma, len, DMA_BIDIRECTIONAL); + if (err) + goto out_free_region_info; + + /* + * page_count might be adjusted by the device, + * update it before freeing region_info DMA + */ + page_count = le32_to_cpu(region_info->page_count); + + dev_dbg(&pdev->dev, + "region_info: regions_dma 0x%llx dma_base 0x%llx page_count %u page_size_log2 %u\n", + regions_dma, region_start, page_count, + (u8)ilog2(region_page_size)); + + err = pds_vfio_dirty_alloc_bitmaps(dirty, page_count / BITS_PER_BYTE); + if (err) { + dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n", + ERR_PTR(err)); + goto out_free_region_info; + } + + err = pds_vfio_dirty_alloc_sgl(pds_vfio, page_count); + if (err) { + dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n", + ERR_PTR(err)); + goto out_free_bitmaps; + } + + dirty->region_start = region_start; + dirty->region_size = region_size; + dirty->region_page_size = region_page_size; + pds_vfio_dirty_set_enabled(pds_vfio); + + pds_vfio_print_guest_region_info(pds_vfio, max_regions); + + kfree(region_info); + + return 0; + +out_free_bitmaps: + pds_vfio_dirty_free_bitmaps(dirty); +out_free_region_info: + kfree(region_info); + return err; +} + +void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, bool send_cmd) +{ + if (pds_vfio_dirty_is_enabled(pds_vfio)) { + pds_vfio_dirty_set_disabled(pds_vfio); + if (send_cmd) + pds_vfio_dirty_disable_cmd(pds_vfio); + pds_vfio_dirty_free_sgl(pds_vfio); + pds_vfio_dirty_free_bitmaps(&pds_vfio->dirty); + } + + if (send_cmd) + pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, 
PDS_LM_STA_NONE); +} + +static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio, + struct pds_vfio_bmp_info *bmp_info, + u32 offset, u32 bmp_bytes, bool read_seq) +{ + const char *bmp_type_str = read_seq ? "read_seq" : "write_ack"; + u8 dma_dir = read_seq ? DMA_FROM_DEVICE : DMA_TO_DEVICE; + struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; + struct device *pdsc_dev = &pci_physfn(pdev)->dev; + unsigned long long npages; + struct sg_table sg_table; + struct scatterlist *sg; + struct page **pages; + u32 page_offset; + const void *bmp; + size_t size; + u16 num_sge; + int err; + int i; + + bmp = (void *)((u64)bmp_info->bmp + offset); + page_offset = offset_in_page(bmp); + bmp -= page_offset; + + /* + * Start and end of bitmap section to seq/ack might not be page + * aligned, so use the page_offset to account for that so there + * will be enough pages to represent the bmp_bytes + */ + npages = DIV_ROUND_UP_ULL(bmp_bytes + page_offset, PAGE_SIZE); + pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + for (unsigned long long i = 0; i < npages; i++) { + struct page *page = vmalloc_to_page(bmp); + + if (!page) { + err = -EFAULT; + goto out_free_pages; + } + + pages[i] = page; + bmp += PAGE_SIZE; + } + + err = sg_alloc_table_from_pages(&sg_table, pages, npages, page_offset, + bmp_bytes, GFP_KERNEL); + if (err) + goto out_free_pages; + + err = dma_map_sgtable(pdsc_dev, &sg_table, dma_dir, 0); + if (err) + goto out_free_sg_table; + + for_each_sgtable_dma_sg(&sg_table, sg, i) { + struct pds_lm_sg_elem *sg_elem = &bmp_info->sgl[i]; + + sg_elem->addr = cpu_to_le64(sg_dma_address(sg)); + sg_elem->len = cpu_to_le32(sg_dma_len(sg)); + } + + num_sge = sg_table.nents; + size = num_sge * sizeof(struct pds_lm_sg_elem); + dma_sync_single_for_device(pdsc_dev, bmp_info->sgl_addr, size, dma_dir); + err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, bmp_info->sgl_addr, num_sge, + offset, bmp_bytes, read_seq); + if (err) + dev_err(&pdev->dev, + "Dirty bitmap %s failed offset %u bmp_bytes %u num_sge %u DMA 0x%llx: %pe\n", + bmp_type_str, offset, bmp_bytes, + num_sge, bmp_info->sgl_addr, ERR_PTR(err)); + dma_sync_single_for_cpu(pdsc_dev, bmp_info->sgl_addr, size, dma_dir); + + dma_unmap_sgtable(pdsc_dev, &sg_table, dma_dir, 0); +out_free_sg_table: + sg_free_table(&sg_table); +out_free_pages: + kfree(pages); + + return err; +} + +static int pds_vfio_dirty_write_ack(struct pds_vfio_pci_device *pds_vfio, + u32 offset, u32 len) +{ + return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_ack, + offset, len, WRITE_ACK); +} + +static int pds_vfio_dirty_read_seq(struct pds_vfio_pci_device *pds_vfio, + u32 offset, u32 len) +{ + return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_seq, + offset, len, READ_SEQ); +} + +static int pds_vfio_dirty_process_bitmaps(struct pds_vfio_pci_device *pds_vfio, + struct iova_bitmap *dirty_bitmap, + u32 bmp_offset, u32 len_bytes) +{ + u64 page_size = pds_vfio->dirty.region_page_size; + u64 region_start = pds_vfio->dirty.region_start; + u32 bmp_offset_bit; + __le64 *seq, *ack; + int dword_count; + + dword_count = len_bytes / sizeof(u64); + seq = (__le64 *)((u64)pds_vfio->dirty.host_seq.bmp + bmp_offset); + ack = (__le64 *)((u64)pds_vfio->dirty.host_ack.bmp + bmp_offset); + bmp_offset_bit = bmp_offset * 8; + + for (int i = 0; i < dword_count; i++) { + u64 xor = le64_to_cpu(seq[i]) ^ le64_to_cpu(ack[i]); + + /* prepare for next write_ack call */ + ack[i] = seq[i]; + + for (u8 bit_i = 0; bit_i < BITS_PER_TYPE(u64); ++bit_i) { + if 
(xor & BIT(bit_i)) { + u64 abs_bit_i = bmp_offset_bit + + i * BITS_PER_TYPE(u64) + bit_i; + u64 addr = abs_bit_i * page_size + region_start; + + iova_bitmap_set(dirty_bitmap, addr, page_size); + } + } + } + + return 0; +} + +static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio, + struct iova_bitmap *dirty_bitmap, + unsigned long iova, unsigned long length) +{ + struct device *dev = &pds_vfio->vfio_coredev.pdev->dev; + struct pds_vfio_dirty *dirty = &pds_vfio->dirty; + u64 bmp_offset, bmp_bytes; + u64 bitmap_size, pages; + int err; + + dev_dbg(dev, "vf%u: Get dirty page bitmap\n", pds_vfio->vf_id); + + if (!pds_vfio_dirty_is_enabled(pds_vfio)) { + dev_err(dev, "vf%u: Sync failed, dirty tracking is disabled\n", + pds_vfio->vf_id); + return -EINVAL; + } + + pages = DIV_ROUND_UP(length, pds_vfio->dirty.region_page_size); + bitmap_size = + round_up(pages, sizeof(u64) * BITS_PER_BYTE) / BITS_PER_BYTE; + + dev_dbg(dev, + "vf%u: iova 0x%lx length %lu page_size %llu pages %llu bitmap_size %llu\n", + pds_vfio->vf_id, iova, length, pds_vfio->dirty.region_page_size, + pages, bitmap_size); + + if (!length || ((dirty->region_start + iova + length) > + (dirty->region_start + dirty->region_size))) { + dev_err(dev, "Invalid iova 0x%lx and/or length 0x%lx to sync\n", + iova, length); + return -EINVAL; + } + + /* bitmap is modified in 64 bit chunks */ + bmp_bytes = ALIGN(DIV_ROUND_UP(length / dirty->region_page_size, + sizeof(u64)), + sizeof(u64)); + if (bmp_bytes != bitmap_size) { + dev_err(dev, + "Calculated bitmap bytes %llu not equal to bitmap size %llu\n", + bmp_bytes, bitmap_size); + return -EINVAL; + } + + bmp_offset = DIV_ROUND_UP(iova / dirty->region_page_size, sizeof(u64)); + + dev_dbg(dev, + "Syncing dirty bitmap, iova 0x%lx length 0x%lx, bmp_offset %llu bmp_bytes %llu\n", + iova, length, bmp_offset, bmp_bytes); + + err = pds_vfio_dirty_read_seq(pds_vfio, bmp_offset, bmp_bytes); + if (err) + return err; + + err = pds_vfio_dirty_process_bitmaps(pds_vfio, dirty_bitmap, bmp_offset, + bmp_bytes); + if (err) + return err; + + err = pds_vfio_dirty_write_ack(pds_vfio, bmp_offset, bmp_bytes); + if (err) + return err; + + return 0; +} + +int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova, + unsigned long length, struct iova_bitmap *dirty) +{ + struct pds_vfio_pci_device *pds_vfio = + container_of(vdev, struct pds_vfio_pci_device, + vfio_coredev.vdev); + int err; + + mutex_lock(&pds_vfio->state_mutex); + err = pds_vfio_dirty_sync(pds_vfio, dirty, iova, length); + pds_vfio_state_mutex_unlock(pds_vfio); + + return err; +} + +int pds_vfio_dma_logging_start(struct vfio_device *vdev, + struct rb_root_cached *ranges, u32 nnodes, + u64 *page_size) +{ + struct pds_vfio_pci_device *pds_vfio = + container_of(vdev, struct pds_vfio_pci_device, + vfio_coredev.vdev); + int err; + + mutex_lock(&pds_vfio->state_mutex); + pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS); + err = pds_vfio_dirty_enable(pds_vfio, ranges, nnodes, page_size); + pds_vfio_state_mutex_unlock(pds_vfio); + + return err; +} + +int pds_vfio_dma_logging_stop(struct vfio_device *vdev) +{ + struct pds_vfio_pci_device *pds_vfio = + container_of(vdev, struct pds_vfio_pci_device, + vfio_coredev.vdev); + + mutex_lock(&pds_vfio->state_mutex); + pds_vfio_dirty_disable(pds_vfio, true); + pds_vfio_state_mutex_unlock(pds_vfio); + + return 0; +} diff --git a/drivers/vfio/pci/pds/dirty.h b/drivers/vfio/pci/pds/dirty.h new file mode 100644 index 000000000000..f78da25d75ca --- /dev/null +++ 
b/drivers/vfio/pci/pds/dirty.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc. */ + +#ifndef _DIRTY_H_ +#define _DIRTY_H_ + +struct pds_vfio_bmp_info { + unsigned long *bmp; + u32 bmp_bytes; + struct pds_lm_sg_elem *sgl; + dma_addr_t sgl_addr; + u16 num_sge; +}; + +struct pds_vfio_dirty { + struct pds_vfio_bmp_info host_seq; + struct pds_vfio_bmp_info host_ack; + u64 region_size; + u64 region_start; + u64 region_page_size; + bool is_enabled; +}; + +struct pds_vfio_pci_device; + +bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio); +void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio); +void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio); +void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, + bool send_cmd); + +int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova, + unsigned long length, + struct iova_bitmap *dirty); +int pds_vfio_dma_logging_start(struct vfio_device *vdev, + struct rb_root_cached *ranges, u32 nnodes, + u64 *page_size); +int pds_vfio_dma_logging_stop(struct vfio_device *vdev); +#endif /* _DIRTY_H_ */ diff --git a/drivers/vfio/pci/pds/lm.c b/drivers/vfio/pci/pds/lm.c new file mode 100644 index 000000000000..79fe2e66bb49 --- /dev/null +++ b/drivers/vfio/pci/pds/lm.c @@ -0,0 +1,434 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2023 Advanced Micro Devices, Inc. */ + +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/highmem.h> +#include <linux/vfio.h> +#include <linux/vfio_pci_core.h> + +#include "vfio_dev.h" +#include "cmds.h" + +static struct pds_vfio_lm_file * +pds_vfio_get_lm_file(const struct file_operations *fops, int flags, u64 size) +{ + struct pds_vfio_lm_file *lm_file = NULL; + unsigned long long npages; + struct page **pages; + void *page_mem; + const void *p; + + if (!size) + return NULL; + + /* Alloc file structure */ + lm_file = kzalloc(sizeof(*lm_file), GFP_KERNEL); + if (!lm_file) + return NULL; + + /* Create file */ + lm_file->filep = + anon_inode_getfile("pds_vfio_lm", fops, lm_file, flags); + if (IS_ERR(lm_file->filep)) + goto out_free_file; + + stream_open(lm_file->filep->f_inode, lm_file->filep); + mutex_init(&lm_file->lock); + + /* prevent file from being released before we are done with it */ + get_file(lm_file->filep); + + /* Allocate memory for file pages */ + npages = DIV_ROUND_UP_ULL(size, PAGE_SIZE); + pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) + goto out_put_file; + + page_mem = kvzalloc(ALIGN(size, PAGE_SIZE), GFP_KERNEL); + if (!page_mem) + goto out_free_pages_array; + + p = page_mem - offset_in_page(page_mem); + for (unsigned long long i = 0; i < npages; i++) { + if (is_vmalloc_addr(p)) + pages[i] = vmalloc_to_page(p); + else + pages[i] = kmap_to_page((void *)p); + if (!pages[i]) + goto out_free_page_mem; + + p += PAGE_SIZE; + } + + /* Create scatterlist of file pages to use for DMA mapping later */ + if (sg_alloc_table_from_pages(&lm_file->sg_table, pages, npages, 0, + size, GFP_KERNEL)) + goto out_free_page_mem; + + lm_file->size = size; + lm_file->pages = pages; + lm_file->npages = npages; + lm_file->page_mem = page_mem; + lm_file->alloc_size = npages * PAGE_SIZE; + + return lm_file; + +out_free_page_mem: + kvfree(page_mem); +out_free_pages_array: + kfree(pages); +out_put_file: + fput(lm_file->filep); + mutex_destroy(&lm_file->lock); +out_free_file: + kfree(lm_file); + + return NULL; +} + +static void 
pds_vfio_put_lm_file(struct pds_vfio_lm_file *lm_file) +{ + mutex_lock(&lm_file->lock); + + lm_file->size = 0; + lm_file->alloc_size = 0; + + /* Free scatter list of file pages */ + sg_free_table(&lm_file->sg_table); + + kvfree(lm_file->page_mem); + lm_file->page_mem = NULL; + kfree(lm_file->pages); + lm_file->pages = NULL; + + mutex_unlock(&lm_file->lock); + + /* allow file to be released since we are done with it */ + fput(lm_file->filep); +} + +void pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio) +{ + if (!pds_vfio->save_file) + return; + + pds_vfio_put_lm_file(pds_vfio->save_file); + pds_vfio->save_file = NULL; +} + +void pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio) +{ + if (!pds_vfio->restore_file) + return; + + pds_vfio_put_lm_file(pds_vfio->restore_file); + pds_vfio->restore_file = NULL; +} + +static struct page *pds_vfio_get_file_page(struct pds_vfio_lm_file *lm_file, + unsigned long offset) +{ + unsigned long cur_offset = 0; + struct scatterlist *sg; + unsigned int i; + + /* All accesses are sequential */ + if (offset < lm_file->last_offset || !lm_file->last_offset_sg) { + lm_file->last_offset = 0; + lm_file->last_offset_sg = lm_file->sg_table.sgl; + lm_file->sg_last_entry = 0; + } + + cur_offset = lm_file->last_offset; + + for_each_sg(lm_file->last_offset_sg, sg, + lm_file->sg_table.orig_nents - lm_file->sg_last_entry, i) { + if (offset < sg->length + cur_offset) { + lm_file->last_offset_sg = sg; + lm_file->sg_last_entry += i; + lm_file->last_offset = cur_offset; + return nth_page(sg_page(sg), + (offset - cur_offset) / PAGE_SIZE); + } + cur_offset += sg->length; + } + + return NULL; +} + +static int pds_vfio_release_file(struct inode *inode, struct file *filp) +{ + struct pds_vfio_lm_file *lm_file = filp->private_data; + + mutex_lock(&lm_file->lock); + lm_file->filep->f_pos = 0; + lm_file->size = 0; + mutex_unlock(&lm_file->lock); + mutex_destroy(&lm_file->lock); + kfree(lm_file); + + return 0; +} + +static ssize_t pds_vfio_save_read(struct file *filp, char __user *buf, + size_t len, loff_t *pos) +{ + struct pds_vfio_lm_file *lm_file = filp->private_data; + ssize_t done = 0; + + if (pos) + return -ESPIPE; + pos = &filp->f_pos; + + mutex_lock(&lm_file->lock); + if (*pos > lm_file->size) { + done = -EINVAL; + goto out_unlock; + } + + len = min_t(size_t, lm_file->size - *pos, len); + while (len) { + size_t page_offset; + struct page *page; + size_t page_len; + u8 *from_buff; + int err; + + page_offset = (*pos) % PAGE_SIZE; + page = pds_vfio_get_file_page(lm_file, *pos - page_offset); + if (!page) { + if (done == 0) + done = -EINVAL; + goto out_unlock; + } + + page_len = min_t(size_t, len, PAGE_SIZE - page_offset); + from_buff = kmap_local_page(page); + err = copy_to_user(buf, from_buff + page_offset, page_len); + kunmap_local(from_buff); + if (err) { + done = -EFAULT; + goto out_unlock; + } + *pos += page_len; + len -= page_len; + done += page_len; + buf += page_len; + } + +out_unlock: + mutex_unlock(&lm_file->lock); + return done; +} + +static const struct file_operations pds_vfio_save_fops = { + .owner = THIS_MODULE, + .read = pds_vfio_save_read, + .release = pds_vfio_release_file, + .llseek = no_llseek, +}; + +static int pds_vfio_get_save_file(struct pds_vfio_pci_device *pds_vfio) +{ + struct device *dev = &pds_vfio->vfio_coredev.pdev->dev; + struct pds_vfio_lm_file *lm_file; + u64 size; + int err; + + /* Get live migration state size in this state */ + err = pds_vfio_get_lm_state_size_cmd(pds_vfio, &size); + if (err) { + dev_err(dev, "failed to 
get save status: %pe\n", ERR_PTR(err)); + return err; + } + + dev_dbg(dev, "save status, size = %lld\n", size); + + if (!size) { + dev_err(dev, "invalid state size\n"); + return -EIO; + } + + lm_file = pds_vfio_get_lm_file(&pds_vfio_save_fops, O_RDONLY, size); + if (!lm_file) { + dev_err(dev, "failed to create save file\n"); + return -ENOENT; + } + + dev_dbg(dev, "size = %lld, alloc_size = %lld, npages = %lld\n", + lm_file->size, lm_file->alloc_size, lm_file->npages); + + pds_vfio->save_file = lm_file; + + return 0; +} + +static ssize_t pds_vfio_restore_write(struct file *filp, const char __user *buf, + size_t len, loff_t *pos) +{ + struct pds_vfio_lm_file *lm_file = filp->private_data; + loff_t requested_length; + ssize_t done = 0; + + if (pos) + return -ESPIPE; + + pos = &filp->f_pos; + + if (*pos < 0 || + check_add_overflow((loff_t)len, *pos, &requested_length)) + return -EINVAL; + + mutex_lock(&lm_file->lock); + + while (len) { + size_t page_offset; + struct page *page; + size_t page_len; + u8 *to_buff; + int err; + + page_offset = (*pos) % PAGE_SIZE; + page = pds_vfio_get_file_page(lm_file, *pos - page_offset); + if (!page) { + if (done == 0) + done = -EINVAL; + goto out_unlock; + } + + page_len = min_t(size_t, len, PAGE_SIZE - page_offset); + to_buff = kmap_local_page(page); + err = copy_from_user(to_buff + page_offset, buf, page_len); + kunmap_local(to_buff); + if (err) { + done = -EFAULT; + goto out_unlock; + } + *pos += page_len; + len -= page_len; + done += page_len; + buf += page_len; + lm_file->size += page_len; + } +out_unlock: + mutex_unlock(&lm_file->lock); + return done; +} + +static const struct file_operations pds_vfio_restore_fops = { + .owner = THIS_MODULE, + .write = pds_vfio_restore_write, + .release = pds_vfio_release_file, + .llseek = no_llseek, +}; + +static int pds_vfio_get_restore_file(struct pds_vfio_pci_device *pds_vfio) +{ + struct device *dev = &pds_vfio->vfio_coredev.pdev->dev; + struct pds_vfio_lm_file *lm_file; + u64 size; + + size = sizeof(union pds_lm_dev_state); + dev_dbg(dev, "restore status, size = %lld\n", size); + + if (!size) { + dev_err(dev, "invalid state size"); + return -EIO; + } + + lm_file = pds_vfio_get_lm_file(&pds_vfio_restore_fops, O_WRONLY, size); + if (!lm_file) { + dev_err(dev, "failed to create restore file"); + return -ENOENT; + } + pds_vfio->restore_file = lm_file; + + return 0; +} + +struct file * +pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio, + enum vfio_device_mig_state next) +{ + enum vfio_device_mig_state cur = pds_vfio->state; + int err; + + if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_STOP_COPY) { + err = pds_vfio_get_save_file(pds_vfio); + if (err) + return ERR_PTR(err); + + err = pds_vfio_get_lm_state_cmd(pds_vfio); + if (err) { + pds_vfio_put_save_file(pds_vfio); + return ERR_PTR(err); + } + + return pds_vfio->save_file->filep; + } + + if (cur == VFIO_DEVICE_STATE_STOP_COPY && next == VFIO_DEVICE_STATE_STOP) { + pds_vfio_put_save_file(pds_vfio); + pds_vfio_dirty_disable(pds_vfio, true); + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RESUMING) { + err = pds_vfio_get_restore_file(pds_vfio); + if (err) + return ERR_PTR(err); + + return pds_vfio->restore_file->filep; + } + + if (cur == VFIO_DEVICE_STATE_RESUMING && next == VFIO_DEVICE_STATE_STOP) { + err = pds_vfio_set_lm_state_cmd(pds_vfio); + if (err) + return ERR_PTR(err); + + pds_vfio_put_restore_file(pds_vfio); + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_RUNNING && next == 
VFIO_DEVICE_STATE_RUNNING_P2P) { + pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, + PDS_LM_STA_IN_PROGRESS); + err = pds_vfio_suspend_device_cmd(pds_vfio, + PDS_LM_SUSPEND_RESUME_TYPE_P2P); + if (err) + return ERR_PTR(err); + + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_RUNNING) { + err = pds_vfio_resume_device_cmd(pds_vfio, + PDS_LM_SUSPEND_RESUME_TYPE_FULL); + if (err) + return ERR_PTR(err); + + pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE); + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RUNNING_P2P) { + err = pds_vfio_resume_device_cmd(pds_vfio, + PDS_LM_SUSPEND_RESUME_TYPE_P2P); + if (err) + return ERR_PTR(err); + + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_STOP) { + err = pds_vfio_suspend_device_cmd(pds_vfio, + PDS_LM_SUSPEND_RESUME_TYPE_FULL); + if (err) + return ERR_PTR(err); + return NULL; + } + + return ERR_PTR(-EINVAL); +} diff --git a/drivers/vfio/pci/pds/lm.h b/drivers/vfio/pci/pds/lm.h new file mode 100644 index 000000000000..13be893198b7 --- /dev/null +++ b/drivers/vfio/pci/pds/lm.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc. */ + +#ifndef _LM_H_ +#define _LM_H_ + +#include <linux/fs.h> +#include <linux/mutex.h> +#include <linux/scatterlist.h> +#include <linux/types.h> + +#include <linux/pds/pds_common.h> +#include <linux/pds/pds_adminq.h> + +struct pds_vfio_lm_file { + struct file *filep; + struct mutex lock; /* protect live migration data file */ + u64 size; /* Size with valid data */ + u64 alloc_size; /* Total allocated size. Always >= len */ + void *page_mem; /* memory allocated for pages */ + struct page **pages; /* Backing pages for file */ + unsigned long long npages; + struct sg_table sg_table; /* SG table for backing pages */ + struct pds_lm_sg_elem *sgl; /* DMA mapping */ + dma_addr_t sgl_addr; + u16 num_sge; + struct scatterlist *last_offset_sg; /* Iterator */ + unsigned int sg_last_entry; + unsigned long last_offset; +}; + +struct pds_vfio_pci_device; + +struct file * +pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio, + enum vfio_device_mig_state next); + +void pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio); +void pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio); + +#endif /* _LM_H_ */ diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c new file mode 100644 index 000000000000..ab4b5958e413 --- /dev/null +++ b/drivers/vfio/pci/pds/pci_drv.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2023 Advanced Micro Devices, Inc. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/types.h> +#include <linux/vfio.h> + +#include <linux/pds/pds_common.h> +#include <linux/pds/pds_core_if.h> +#include <linux/pds/pds_adminq.h> + +#include "vfio_dev.h" +#include "pci_drv.h" +#include "cmds.h" + +#define PDS_VFIO_DRV_DESCRIPTION "AMD/Pensando VFIO Device Driver" +#define PCI_VENDOR_ID_PENSANDO 0x1dd8 + +static void pds_vfio_recovery(struct pds_vfio_pci_device *pds_vfio) +{ + bool deferred_reset_needed = false; + + /* + * Documentation states that the kernel migration driver must not + * generate asynchronous device state transitions outside of + * manipulation by the user or the VFIO_DEVICE_RESET ioctl. + * + * Since recovery is an asynchronous event received from the device, + * initiate a deferred reset. 
Issue a deferred reset in the following + * situations: + * 1. Migration is in progress, which will cause the next step of + * the migration to fail. + * 2. If the device is in a state that will be set to + * VFIO_DEVICE_STATE_RUNNING on the next action (i.e. VM is + * shutdown and device is in VFIO_DEVICE_STATE_STOP). + */ + mutex_lock(&pds_vfio->state_mutex); + if ((pds_vfio->state != VFIO_DEVICE_STATE_RUNNING && + pds_vfio->state != VFIO_DEVICE_STATE_ERROR) || + (pds_vfio->state == VFIO_DEVICE_STATE_RUNNING && + pds_vfio_dirty_is_enabled(pds_vfio))) + deferred_reset_needed = true; + mutex_unlock(&pds_vfio->state_mutex); + + /* + * On the next user initiated state transition, the device will + * transition to the VFIO_DEVICE_STATE_ERROR. At this point it's the user's + * responsibility to reset the device. + * + * If a VFIO_DEVICE_RESET is requested post recovery and before the next + * state transition, then the deferred reset state will be set to + * VFIO_DEVICE_STATE_RUNNING. + */ + if (deferred_reset_needed) { + spin_lock(&pds_vfio->reset_lock); + pds_vfio->deferred_reset = true; + pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_ERROR; + spin_unlock(&pds_vfio->reset_lock); + } +} + +static int pds_vfio_pci_notify_handler(struct notifier_block *nb, + unsigned long ecode, void *data) +{ + struct pds_vfio_pci_device *pds_vfio = + container_of(nb, struct pds_vfio_pci_device, nb); + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_notifyq_comp *event = data; + + dev_dbg(dev, "%s: event code %lu\n", __func__, ecode); + + /* + * We don't need to do anything for RESET state==0 as there is no notify + * or feedback mechanism available, and it is possible that we won't + * even see a state==0 event since the pds_core recovery is pending. + * + * Any requests from VFIO while state==0 will fail, which will return + * error and may cause migration to fail. 
+ */ + if (ecode == PDS_EVENT_RESET) { + dev_info(dev, "%s: PDS_EVENT_RESET event received, state==%d\n", + __func__, event->reset.state); + /* + * pds_core device finished recovery and sent us the + * notification (state == 1) to allow us to recover + */ + if (event->reset.state == 1) + pds_vfio_recovery(pds_vfio); + } + + return 0; +} + +static int +pds_vfio_pci_register_event_handler(struct pds_vfio_pci_device *pds_vfio) +{ + struct device *dev = pds_vfio_to_dev(pds_vfio); + struct notifier_block *nb = &pds_vfio->nb; + int err; + + if (!nb->notifier_call) { + nb->notifier_call = pds_vfio_pci_notify_handler; + err = pdsc_register_notify(nb); + if (err) { + nb->notifier_call = NULL; + dev_err(dev, + "failed to register pds event handler: %pe\n", + ERR_PTR(err)); + return -EINVAL; + } + dev_dbg(dev, "pds event handler registered\n"); + } + + return 0; +} + +static void +pds_vfio_pci_unregister_event_handler(struct pds_vfio_pci_device *pds_vfio) +{ + if (pds_vfio->nb.notifier_call) { + pdsc_unregister_notify(&pds_vfio->nb); + pds_vfio->nb.notifier_call = NULL; + } +} + +static int pds_vfio_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct pds_vfio_pci_device *pds_vfio; + int err; + + pds_vfio = vfio_alloc_device(pds_vfio_pci_device, vfio_coredev.vdev, + &pdev->dev, pds_vfio_ops_info()); + if (IS_ERR(pds_vfio)) + return PTR_ERR(pds_vfio); + + dev_set_drvdata(&pdev->dev, &pds_vfio->vfio_coredev); + + err = vfio_pci_core_register_device(&pds_vfio->vfio_coredev); + if (err) + goto out_put_vdev; + + err = pds_vfio_register_client_cmd(pds_vfio); + if (err) { + dev_err(&pdev->dev, "failed to register as client: %pe\n", + ERR_PTR(err)); + goto out_unregister_coredev; + } + + err = pds_vfio_pci_register_event_handler(pds_vfio); + if (err) + goto out_unregister_client; + + return 0; + +out_unregister_client: + pds_vfio_unregister_client_cmd(pds_vfio); +out_unregister_coredev: + vfio_pci_core_unregister_device(&pds_vfio->vfio_coredev); +out_put_vdev: + vfio_put_device(&pds_vfio->vfio_coredev.vdev); + return err; +} + +static void pds_vfio_pci_remove(struct pci_dev *pdev) +{ + struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev); + + pds_vfio_pci_unregister_event_handler(pds_vfio); + pds_vfio_unregister_client_cmd(pds_vfio); + vfio_pci_core_unregister_device(&pds_vfio->vfio_coredev); + vfio_put_device(&pds_vfio->vfio_coredev.vdev); +} + +static const struct pci_device_id pds_vfio_pci_table[] = { + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_PENSANDO, 0x1003) }, /* Ethernet VF */ + { 0, } +}; +MODULE_DEVICE_TABLE(pci, pds_vfio_pci_table); + +static void pds_vfio_pci_aer_reset_done(struct pci_dev *pdev) +{ + struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev); + + pds_vfio_reset(pds_vfio); +} + +static const struct pci_error_handlers pds_vfio_pci_err_handlers = { + .reset_done = pds_vfio_pci_aer_reset_done, + .error_detected = vfio_pci_core_aer_err_detected, +}; + +static struct pci_driver pds_vfio_pci_driver = { + .name = KBUILD_MODNAME, + .id_table = pds_vfio_pci_table, + .probe = pds_vfio_pci_probe, + .remove = pds_vfio_pci_remove, + .err_handler = &pds_vfio_pci_err_handlers, + .driver_managed_dma = true, +}; + +module_pci_driver(pds_vfio_pci_driver); + +MODULE_DESCRIPTION(PDS_VFIO_DRV_DESCRIPTION); +MODULE_AUTHOR("Brett Creeley <brett.creeley@amd.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/vfio/pci/pds/pci_drv.h b/drivers/vfio/pci/pds/pci_drv.h new file mode 100644 index 000000000000..e79bed12ed14 --- /dev/null +++ 
b/drivers/vfio/pci/pds/pci_drv.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc. */ + +#ifndef _PCI_DRV_H +#define _PCI_DRV_H + +#include <linux/pci.h> + +#endif /* _PCI_DRV_H */ diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c new file mode 100644 index 000000000000..b46174f5eb09 --- /dev/null +++ b/drivers/vfio/pci/pds/vfio_dev.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2023 Advanced Micro Devices, Inc. */ + +#include <linux/vfio.h> +#include <linux/vfio_pci_core.h> + +#include "lm.h" +#include "dirty.h" +#include "vfio_dev.h" + +struct pci_dev *pds_vfio_to_pci_dev(struct pds_vfio_pci_device *pds_vfio) +{ + return pds_vfio->vfio_coredev.pdev; +} + +struct device *pds_vfio_to_dev(struct pds_vfio_pci_device *pds_vfio) +{ + return &pds_vfio_to_pci_dev(pds_vfio)->dev; +} + +struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev) +{ + struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); + + return container_of(core_device, struct pds_vfio_pci_device, + vfio_coredev); +} + +void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio) +{ +again: + spin_lock(&pds_vfio->reset_lock); + if (pds_vfio->deferred_reset) { + pds_vfio->deferred_reset = false; + if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) { + pds_vfio_put_restore_file(pds_vfio); + pds_vfio_put_save_file(pds_vfio); + pds_vfio_dirty_disable(pds_vfio, false); + } + pds_vfio->state = pds_vfio->deferred_reset_state; + pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING; + spin_unlock(&pds_vfio->reset_lock); + goto again; + } + mutex_unlock(&pds_vfio->state_mutex); + spin_unlock(&pds_vfio->reset_lock); +} + +void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio) +{ + spin_lock(&pds_vfio->reset_lock); + pds_vfio->deferred_reset = true; + pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING; + if (!mutex_trylock(&pds_vfio->state_mutex)) { + spin_unlock(&pds_vfio->reset_lock); + return; + } + spin_unlock(&pds_vfio->reset_lock); + pds_vfio_state_mutex_unlock(pds_vfio); +} + +static struct file * +pds_vfio_set_device_state(struct vfio_device *vdev, + enum vfio_device_mig_state new_state) +{ + struct pds_vfio_pci_device *pds_vfio = + container_of(vdev, struct pds_vfio_pci_device, + vfio_coredev.vdev); + struct file *res = NULL; + + mutex_lock(&pds_vfio->state_mutex); + /* + * only way to transition out of VFIO_DEVICE_STATE_ERROR is via + * VFIO_DEVICE_RESET, so prevent the state machine from running since + * vfio_mig_get_next_state() will throw a WARN_ON() when transitioning + * from VFIO_DEVICE_STATE_ERROR to any other state + */ + while (pds_vfio->state != VFIO_DEVICE_STATE_ERROR && + new_state != pds_vfio->state) { + enum vfio_device_mig_state next_state; + + int err = vfio_mig_get_next_state(vdev, pds_vfio->state, + new_state, &next_state); + if (err) { + res = ERR_PTR(err); + break; + } + + res = pds_vfio_step_device_state_locked(pds_vfio, next_state); + if (IS_ERR(res)) + break; + + pds_vfio->state = next_state; + + if (WARN_ON(res && new_state != pds_vfio->state)) { + res = ERR_PTR(-EINVAL); + break; + } + } + pds_vfio_state_mutex_unlock(pds_vfio); + /* still waiting on a deferred_reset */ + if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) + res = ERR_PTR(-EIO); + + return res; +} + +static int pds_vfio_get_device_state(struct vfio_device *vdev, + enum vfio_device_mig_state *current_state) +{ + struct pds_vfio_pci_device *pds_vfio = + container_of(vdev, struct 
pds_vfio_pci_device, + vfio_coredev.vdev); + + mutex_lock(&pds_vfio->state_mutex); + *current_state = pds_vfio->state; + pds_vfio_state_mutex_unlock(pds_vfio); + return 0; +} + +static int pds_vfio_get_device_state_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + *stop_copy_length = PDS_LM_DEVICE_STATE_LENGTH; + return 0; +} + +static const struct vfio_migration_ops pds_vfio_lm_ops = { + .migration_set_state = pds_vfio_set_device_state, + .migration_get_state = pds_vfio_get_device_state, + .migration_get_data_size = pds_vfio_get_device_state_size +}; + +static const struct vfio_log_ops pds_vfio_log_ops = { + .log_start = pds_vfio_dma_logging_start, + .log_stop = pds_vfio_dma_logging_stop, + .log_read_and_clear = pds_vfio_dma_logging_report, +}; + +static int pds_vfio_init_device(struct vfio_device *vdev) +{ + struct pds_vfio_pci_device *pds_vfio = + container_of(vdev, struct pds_vfio_pci_device, + vfio_coredev.vdev); + struct pci_dev *pdev = to_pci_dev(vdev->dev); + int err, vf_id, pci_id; + + vf_id = pci_iov_vf_id(pdev); + if (vf_id < 0) + return vf_id; + + err = vfio_pci_core_init_dev(vdev); + if (err) + return err; + + pds_vfio->vf_id = vf_id; + + vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P; + vdev->mig_ops = &pds_vfio_lm_ops; + vdev->log_ops = &pds_vfio_log_ops; + + pci_id = PCI_DEVID(pdev->bus->number, pdev->devfn); + dev_dbg(&pdev->dev, + "%s: PF %#04x VF %#04x vf_id %d domain %d pds_vfio %p\n", + __func__, pci_dev_id(pdev->physfn), pci_id, vf_id, + pci_domain_nr(pdev->bus), pds_vfio); + + return 0; +} + +static int pds_vfio_open_device(struct vfio_device *vdev) +{ + struct pds_vfio_pci_device *pds_vfio = + container_of(vdev, struct pds_vfio_pci_device, + vfio_coredev.vdev); + int err; + + err = vfio_pci_core_enable(&pds_vfio->vfio_coredev); + if (err) + return err; + + mutex_init(&pds_vfio->state_mutex); + pds_vfio->state = VFIO_DEVICE_STATE_RUNNING; + pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING; + + vfio_pci_core_finish_enable(&pds_vfio->vfio_coredev); + + return 0; +} + +static void pds_vfio_close_device(struct vfio_device *vdev) +{ + struct pds_vfio_pci_device *pds_vfio = + container_of(vdev, struct pds_vfio_pci_device, + vfio_coredev.vdev); + + mutex_lock(&pds_vfio->state_mutex); + pds_vfio_put_restore_file(pds_vfio); + pds_vfio_put_save_file(pds_vfio); + pds_vfio_dirty_disable(pds_vfio, true); + mutex_unlock(&pds_vfio->state_mutex); + mutex_destroy(&pds_vfio->state_mutex); + vfio_pci_core_close_device(vdev); +} + +static const struct vfio_device_ops pds_vfio_ops = { + .name = "pds-vfio", + .init = pds_vfio_init_device, + .release = vfio_pci_core_release_dev, + .open_device = pds_vfio_open_device, + .close_device = pds_vfio_close_device, + .ioctl = vfio_pci_core_ioctl, + .device_feature = vfio_pci_core_ioctl_feature, + .read = vfio_pci_core_read, + .write = vfio_pci_core_write, + .mmap = vfio_pci_core_mmap, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, +}; + +const struct vfio_device_ops *pds_vfio_ops_info(void) +{ + return &pds_vfio_ops; +} diff --git a/drivers/vfio/pci/pds/vfio_dev.h b/drivers/vfio/pci/pds/vfio_dev.h new file mode 100644 index 000000000000..b8f2d667608f --- /dev/null +++ b/drivers/vfio/pci/pds/vfio_dev.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc. 
*/ + +#ifndef _VFIO_DEV_H_ +#define _VFIO_DEV_H_ + +#include <linux/pci.h> +#include <linux/vfio_pci_core.h> + +#include "dirty.h" +#include "lm.h" + +struct pds_vfio_pci_device { + struct vfio_pci_core_device vfio_coredev; + + struct pds_vfio_lm_file *save_file; + struct pds_vfio_lm_file *restore_file; + struct pds_vfio_dirty dirty; + struct mutex state_mutex; /* protect migration state */ + enum vfio_device_mig_state state; + spinlock_t reset_lock; /* protect reset_done flow */ + u8 deferred_reset; + enum vfio_device_mig_state deferred_reset_state; + struct notifier_block nb; + + int vf_id; + u16 client_id; +}; + +void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio); + +const struct vfio_device_ops *pds_vfio_ops_info(void); +struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev); +void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio); + +struct pci_dev *pds_vfio_to_pci_dev(struct pds_vfio_pci_device *pds_vfio); +struct device *pds_vfio_to_dev(struct pds_vfio_pci_device *pds_vfio); + +#endif /* _VFIO_DEV_H_ */ diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 29091ee2e984..cb5b7f865d58 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -141,6 +141,7 @@ static const struct vfio_device_ops vfio_pci_ops = { .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, }; static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 20d7b69ea6ff..1929103ee59a 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -27,6 +27,7 @@ #include <linux/vgaarb.h> #include <linux/nospec.h> #include <linux/sched/mm.h> +#include <linux/iommufd.h> #if IS_ENABLED(CONFIG_EEH) #include <asm/eeh.h> #endif @@ -180,7 +181,8 @@ no_mmap: struct vfio_pci_group_info; static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set); static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, - struct vfio_pci_group_info *groups); + struct vfio_pci_group_info *groups, + struct iommufd_ctx *iommufd_ctx); /* * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND @@ -776,29 +778,65 @@ static int vfio_pci_count_devs(struct pci_dev *pdev, void *data) } struct vfio_pci_fill_info { - int max; - int cur; - struct vfio_pci_dependent_device *devices; + struct vfio_pci_dependent_device __user *devices; + struct vfio_pci_dependent_device __user *devices_end; + struct vfio_device *vdev; + u32 count; + u32 flags; }; static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) { + struct vfio_pci_dependent_device info = { + .segment = pci_domain_nr(pdev->bus), + .bus = pdev->bus->number, + .devfn = pdev->devfn, + }; struct vfio_pci_fill_info *fill = data; - struct iommu_group *iommu_group; - if (fill->cur == fill->max) - return -EAGAIN; /* Something changed, try again */ + fill->count++; + if (fill->devices >= fill->devices_end) + return 0; + + if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) { + struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev); + struct vfio_device_set *dev_set = fill->vdev->dev_set; + struct vfio_device *vdev; + + /* + * hot-reset requires all affected devices be represented in + * the dev_set. 
+ */ + vdev = vfio_find_device_in_devset(dev_set, &pdev->dev); + if (!vdev) { + info.devid = VFIO_PCI_DEVID_NOT_OWNED; + } else { + int id = vfio_iommufd_get_dev_id(vdev, iommufd); + + if (id > 0) + info.devid = id; + else if (id == -ENOENT) + info.devid = VFIO_PCI_DEVID_OWNED; + else + info.devid = VFIO_PCI_DEVID_NOT_OWNED; + } + /* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */ + if (info.devid == VFIO_PCI_DEVID_NOT_OWNED) + fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED; + } else { + struct iommu_group *iommu_group; + + iommu_group = iommu_group_get(&pdev->dev); + if (!iommu_group) + return -EPERM; /* Cannot reset non-isolated devices */ - iommu_group = iommu_group_get(&pdev->dev); - if (!iommu_group) - return -EPERM; /* Cannot reset non-isolated devices */ + info.group_id = iommu_group_id(iommu_group); + iommu_group_put(iommu_group); + } - fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); - fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); - fill->devices[fill->cur].bus = pdev->bus->number; - fill->devices[fill->cur].devfn = pdev->devfn; - fill->cur++; - iommu_group_put(iommu_group); + if (copy_to_user(fill->devices, &info, sizeof(info))) + return -EFAULT; + fill->devices++; return 0; } @@ -920,24 +958,17 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, struct vfio_device_info __user *arg) { unsigned long minsz = offsetofend(struct vfio_device_info, num_irqs); - struct vfio_device_info info; + struct vfio_device_info info = {}; struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; - unsigned long capsz; int ret; - /* For backward compatibility, cannot require this */ - capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); - if (copy_from_user(&info, arg, minsz)) return -EFAULT; if (info.argsz < minsz) return -EINVAL; - if (info.argsz >= capsz) { - minsz = capsz; - info.cap_offset = 0; - } + minsz = min_t(size_t, info.argsz, sizeof(info)); info.flags = VFIO_DEVICE_FLAGS_PCI; @@ -1228,8 +1259,7 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( unsigned long minsz = offsetofend(struct vfio_pci_hot_reset_info, count); struct vfio_pci_hot_reset_info hdr; - struct vfio_pci_fill_info fill = { 0 }; - struct vfio_pci_dependent_device *devices = NULL; + struct vfio_pci_fill_info fill = {}; bool slot = false; int ret = 0; @@ -1247,78 +1277,42 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( else if (pci_probe_reset_bus(vdev->pdev->bus)) return -ENODEV; - /* How many devices are affected? */ - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs, - &fill.max, slot); - if (ret) - return ret; + fill.devices = arg->devices; + fill.devices_end = arg->devices + + (hdr.argsz - sizeof(hdr)) / sizeof(arg->devices[0]); + fill.vdev = &vdev->vdev; - WARN_ON(!fill.max); /* Should always be at least one */ - - /* - * If there's enough space, fill it now, otherwise return -ENOSPC and - * the number of devices affected. 
- */ - if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { - ret = -ENOSPC; - hdr.count = fill.max; - goto reset_info_exit; - } - - devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); - if (!devices) - return -ENOMEM; - - fill.devices = devices; + if (vfio_device_cdev_opened(&vdev->vdev)) + fill.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID | + VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED; + mutex_lock(&vdev->vdev.dev_set->lock); ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs, &fill, slot); + mutex_unlock(&vdev->vdev.dev_set->lock); + if (ret) + return ret; - /* - * If a device was removed between counting and filling, we may come up - * short of fill.max. If a device was added, we'll have a return of - * -EAGAIN above. - */ - if (!ret) - hdr.count = fill.cur; - -reset_info_exit: + hdr.count = fill.count; + hdr.flags = fill.flags; if (copy_to_user(arg, &hdr, minsz)) - ret = -EFAULT; - - if (!ret) { - if (copy_to_user(&arg->devices, devices, - hdr.count * sizeof(*devices))) - ret = -EFAULT; - } + return -EFAULT; - kfree(devices); - return ret; + if (fill.count > fill.devices - arg->devices) + return -ENOSPC; + return 0; } -static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, - struct vfio_pci_hot_reset __user *arg) +static int +vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev, + int array_count, bool slot, + struct vfio_pci_hot_reset __user *arg) { - unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count); - struct vfio_pci_hot_reset hdr; int32_t *group_fds; struct file **files; struct vfio_pci_group_info info; - bool slot = false; int file_idx, count = 0, ret = 0; - if (copy_from_user(&hdr, arg, minsz)) - return -EFAULT; - - if (hdr.argsz < minsz || hdr.flags) - return -EINVAL; - - /* Can we do a slot or bus reset or neither? */ - if (!pci_probe_reset_slot(vdev->pdev->slot)) - slot = true; - else if (pci_probe_reset_bus(vdev->pdev->bus)) - return -ENODEV; - /* * We can't let userspace give us an arbitrarily large buffer to copy, * so verify how many we think there could be. Note groups can have @@ -1329,12 +1323,11 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, if (ret) return ret; - /* Somewhere between 1 and count is OK */ - if (!hdr.count || hdr.count > count) + if (array_count > count) return -EINVAL; - group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); - files = kcalloc(hdr.count, sizeof(*files), GFP_KERNEL); + group_fds = kcalloc(array_count, sizeof(*group_fds), GFP_KERNEL); + files = kcalloc(array_count, sizeof(*files), GFP_KERNEL); if (!group_fds || !files) { kfree(group_fds); kfree(files); @@ -1342,18 +1335,17 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, } if (copy_from_user(group_fds, arg->group_fds, - hdr.count * sizeof(*group_fds))) { + array_count * sizeof(*group_fds))) { kfree(group_fds); kfree(files); return -EFAULT; } /* - * For each group_fd, get the group through the vfio external user - * interface and store the group and iommu ID. This ensures the group - * is held across the reset. 
+ * Get the group file for each fd to ensure the group is held across + * the reset */ - for (file_idx = 0; file_idx < hdr.count; file_idx++) { + for (file_idx = 0; file_idx < array_count; file_idx++) { struct file *file = fget(group_fds[file_idx]); if (!file) { @@ -1377,10 +1369,10 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, if (ret) goto hot_reset_release; - info.count = hdr.count; + info.count = array_count; info.files = files; - ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info); + ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info, NULL); hot_reset_release: for (file_idx--; file_idx >= 0; file_idx--) @@ -1390,6 +1382,36 @@ hot_reset_release: return ret; } +static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, + struct vfio_pci_hot_reset __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count); + struct vfio_pci_hot_reset hdr; + bool slot = false; + + if (copy_from_user(&hdr, arg, minsz)) + return -EFAULT; + + if (hdr.argsz < minsz || hdr.flags) + return -EINVAL; + + /* zero-length array is only for cdev opened devices */ + if (!!hdr.count == vfio_device_cdev_opened(&vdev->vdev)) + return -EINVAL; + + /* Can we do a slot or bus reset or neither? */ + if (!pci_probe_reset_slot(vdev->pdev->slot)) + slot = true; + else if (pci_probe_reset_bus(vdev->pdev->bus)) + return -ENODEV; + + if (hdr.count) + return vfio_pci_ioctl_pci_hot_reset_groups(vdev, hdr.count, slot, arg); + + return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL, + vfio_iommufd_device_ictx(&vdev->vdev)); +} + static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev, struct vfio_device_ioeventfd __user *arg) { @@ -2355,13 +2377,16 @@ const struct pci_error_handlers vfio_pci_core_err_handlers = { }; EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers); -static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev, +static bool vfio_dev_in_groups(struct vfio_device *vdev, struct vfio_pci_group_info *groups) { unsigned int i; + if (!groups) + return false; + for (i = 0; i < groups->count; i++) - if (vfio_file_has_dev(groups->files[i], &vdev->vdev)) + if (vfio_file_has_dev(groups->files[i], vdev)) return true; return false; } @@ -2369,12 +2394,8 @@ static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev, static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data) { struct vfio_device_set *dev_set = data; - struct vfio_device *cur; - list_for_each_entry(cur, &dev_set->device_list, dev_set_list) - if (cur->dev == &pdev->dev) - return 0; - return -EBUSY; + return vfio_find_device_in_devset(dev_set, &pdev->dev) ? 0 : -ENODEV; } /* @@ -2441,7 +2462,8 @@ unwind: * get each memory_lock. */ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, - struct vfio_pci_group_info *groups) + struct vfio_pci_group_info *groups, + struct iommufd_ctx *iommufd_ctx) { struct vfio_pci_core_device *cur_mem; struct vfio_pci_core_device *cur_vma; @@ -2471,11 +2493,38 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, goto err_unlock; list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) { + bool owned; + /* - * Test whether all the affected devices are contained by the - * set of groups provided by the user. + * Test whether all the affected devices can be reset by the + * user. + * + * If called from a group opened device and the user provides + * a set of groups, all the devices in the dev_set should be + * contained by the set of groups provided by the user. 
+ * + * If called from a cdev opened device and the user provides + * a zero-length array, all the devices in the dev_set must + * be bound to the same iommufd_ctx as the input iommufd_ctx. + * If there is any device that has not been bound to any + * iommufd_ctx yet, check if its iommu_group has any device + * bound to the input iommufd_ctx. Such devices can be + * considered owned by the input iommufd_ctx as the device + * cannot be owned by another iommufd_ctx when its iommu_group + * is owned. + * + * Otherwise, reset is not allowed. */ - if (!vfio_dev_in_groups(cur_vma, groups)) { + if (iommufd_ctx) { + int devid = vfio_iommufd_get_dev_id(&cur_vma->vdev, + iommufd_ctx); + + owned = (devid > 0 || devid == -ENOENT); + } else { + owned = vfio_dev_in_groups(&cur_vma->vdev, groups); + } + + if (!owned) { ret = -EINVAL; goto err_undo; } |
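
The recovery path earlier in this series parks the VF in VFIO_DEVICE_STATE_ERROR and leaves the actual reset to userspace. Below is a rough userspace-side sketch of that contract; it is illustrative only and not part of the series. reset_if_errored() and device_fd are hypothetical names, and the structures are assumed to match the migration uapi in <linux/vfio.h> for this kernel.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int reset_if_errored(int device_fd)
{
	uint8_t buf[sizeof(struct vfio_device_feature) +
		    sizeof(struct vfio_device_feature_mig_state)]
		__attribute__((aligned(8))) = { 0 };
	struct vfio_device_feature *feat = (void *)buf;
	struct vfio_device_feature_mig_state *mig =
		(struct vfio_device_feature_mig_state *)feat->data;

	feat->argsz = sizeof(buf);
	feat->flags = VFIO_DEVICE_FEATURE_GET |
		      VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;

	/* Read the current migration state of the device. */
	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feat))
		return -1;

	/*
	 * ERROR is terminal: after pds_core recovery the driver defers its
	 * reset and keeps reporting ERROR until the user resets the device.
	 */
	if (mig->device_state == VFIO_DEVICE_STATE_ERROR)
		return ioctl(device_fd, VFIO_DEVICE_RESET);

	return 0;
}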
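
pds_vfio_set_device_state() walks vfio_mig_get_next_state() one arc at a time, so userspace only names the final state it wants and, on a saving arc, receives a file descriptor backed by the driver's save file. A minimal sketch of requesting STOP_COPY from userspace follows; start_stop_copy() and device_fd are hypothetical, with the same uapi assumptions as above.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int start_stop_copy(int device_fd)
{
	uint8_t buf[sizeof(struct vfio_device_feature) +
		    sizeof(struct vfio_device_feature_mig_state)]
		__attribute__((aligned(8))) = { 0 };
	struct vfio_device_feature *feat = (void *)buf;
	struct vfio_device_feature_mig_state *mig =
		(struct vfio_device_feature_mig_state *)feat->data;

	feat->argsz = sizeof(buf);
	feat->flags = VFIO_DEVICE_FEATURE_SET |
		      VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
	mig->device_state = VFIO_DEVICE_STATE_STOP_COPY;

	/* The driver steps RUNNING -> (RUNNING_P2P) -> STOP -> STOP_COPY. */
	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feat))
		return -1;

	/* data_fd now streams the saved device state. */
	return mig->data_fd;
}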
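
The reworked vfio_pci_ioctl_get_pci_hot_reset_info() copies as many dependent-device entries as the caller's buffer can hold, always reports the full count, and returns -ENOSPC when the buffer was too small; for a cdev-opened device the entries carry iommufd device IDs plus the DEV_ID flags rather than IOMMU group IDs. A sketch of the usual probe-then-fill pattern from userspace is below; dump_hot_reset_info() and device_fd are hypothetical, and the uapi layout is assumed per <linux/vfio.h>.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int dump_hot_reset_info(int device_fd)
{
	struct vfio_pci_hot_reset_info probe = { .argsz = sizeof(probe) };
	struct vfio_pci_hot_reset_info *info;
	unsigned int i;
	size_t sz;

	/* First call has no room for entries: expect -ENOSPC plus the count. */
	if (ioctl(device_fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, &probe) &&
	    errno != ENOSPC)
		return -1;

	sz = sizeof(*info) + probe.count * sizeof(info->devices[0]);
	info = calloc(1, sz);
	if (!info)
		return -1;
	info->argsz = sz;

	/* Second call retrieves the dependent device entries. */
	if (ioctl(device_fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info)) {
		free(info);
		return -1;
	}

	for (i = 0; i < info->count; i++) {
		if (info->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID)
			/* cdev path: entries carry iommufd device IDs */
			printf("dev %u: devid %u\n", i, info->devices[i].devid);
		else
			/* group path: entries carry IOMMU group IDs */
			printf("dev %u: group %u\n", i, info->devices[i].group_id);
	}

	free(info);
	return 0;
}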
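
For the reset itself, a group-opened device still supplies an array of group fds, while a cdev-opened device bound to an iommufd passes a zero-length array and ownership is checked against the iommufd context, as the dev_set walk above describes. A sketch of the cdev form, with hot_reset_cdev() and device_fd again hypothetical:

#include <sys/ioctl.h>
#include <linux/vfio.h>

static int hot_reset_cdev(int device_fd)
{
	struct vfio_pci_hot_reset hdr = {
		.argsz = sizeof(hdr),
		.count = 0,	/* zero-length group_fds[]: iommufd ownership model */
	};

	return ioctl(device_fd, VFIO_DEVICE_PCI_HOT_RESET, &hdr);
}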