Diffstat (limited to 'drivers/vfio/pci')
 drivers/vfio/pci/Kconfig                       |   2
 drivers/vfio/pci/Makefile                      |   2
 drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c |   2
 drivers/vfio/pci/mlx5/cmd.c                    |  48
 drivers/vfio/pci/mlx5/main.c                   |   1
 drivers/vfio/pci/pds/Kconfig                   |  19
 drivers/vfio/pci/pds/Makefile                  |  11
 drivers/vfio/pci/pds/cmds.c                    | 510
 drivers/vfio/pci/pds/cmds.h                    |  25
 drivers/vfio/pci/pds/dirty.c                   | 564
 drivers/vfio/pci/pds/dirty.h                   |  39
 drivers/vfio/pci/pds/lm.c                      | 434
 drivers/vfio/pci/pds/lm.h                      |  41
 drivers/vfio/pci/pds/pci_drv.c                 | 209
 drivers/vfio/pci/pds/pci_drv.h                 |   9
 drivers/vfio/pci/pds/vfio_dev.c                | 227
 drivers/vfio/pci/pds/vfio_dev.h                |  39
 drivers/vfio/pci/vfio_pci.c                    |   1
 drivers/vfio/pci/vfio_pci_core.c               | 261
 19 files changed, 2291 insertions(+), 153 deletions(-)
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 86bb7835cf3c..8125e5f37832 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -63,4 +63,6 @@ source "drivers/vfio/pci/mlx5/Kconfig"
source "drivers/vfio/pci/hisilicon/Kconfig"
+source "drivers/vfio/pci/pds/Kconfig"
+
endmenu
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index 24c524224da5..45167be462d8 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -11,3 +11,5 @@ obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
obj-$(CONFIG_MLX5_VFIO_PCI) += mlx5/
obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/
+
+obj-$(CONFIG_PDS_VFIO_PCI) += pds/
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index a117eaf21c14..b2f9778c8366 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -1373,6 +1373,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = {
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
};
static const struct vfio_device_ops hisi_acc_vfio_pci_ops = {
@@ -1391,6 +1392,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = {
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
};
static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index c82c1f4fc588..33574b04477d 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -732,52 +732,6 @@ void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
mlx5vf_cmd_dealloc_pd(migf);
}
-static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes,
- u32 req_nodes)
-{
- struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
- unsigned long min_gap;
- unsigned long curr_gap;
-
- /* Special shortcut when a single range is required */
- if (req_nodes == 1) {
- unsigned long last;
-
- curr = comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
- while (curr) {
- last = curr->last;
- prev = curr;
- curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
- if (prev != comb_start)
- interval_tree_remove(prev, root);
- }
- comb_start->last = last;
- return;
- }
-
- /* Combine ranges which have the smallest gap */
- while (cur_nodes > req_nodes) {
- prev = NULL;
- min_gap = ULONG_MAX;
- curr = interval_tree_iter_first(root, 0, ULONG_MAX);
- while (curr) {
- if (prev) {
- curr_gap = curr->start - prev->last;
- if (curr_gap < min_gap) {
- min_gap = curr_gap;
- comb_start = prev;
- comb_end = curr;
- }
- }
- prev = curr;
- curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
- }
- comb_start->last = comb_end->last;
- interval_tree_remove(comb_end, root);
- cur_nodes--;
- }
-}
-
static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
struct mlx5vf_pci_core_device *mvdev,
struct rb_root_cached *ranges, u32 nnodes)
@@ -800,7 +754,7 @@ static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
int i;
if (num_ranges > max_num_range) {
- combine_ranges(ranges, nnodes, max_num_range);
+ vfio_combine_iova_ranges(ranges, nnodes, max_num_range);
num_ranges = max_num_range;
}
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index d95fd382814c..42ec574a8622 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -1320,6 +1320,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = {
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
};
static int mlx5vf_pci_probe(struct pci_dev *pdev,
diff --git a/drivers/vfio/pci/pds/Kconfig b/drivers/vfio/pci/pds/Kconfig
new file mode 100644
index 000000000000..407b3fd32733
--- /dev/null
+++ b/drivers/vfio/pci/pds/Kconfig
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2023 Advanced Micro Devices, Inc.
+
+config PDS_VFIO_PCI
+ tristate "VFIO support for PDS PCI devices"
+ depends on PDS_CORE
+ select VFIO_PCI_CORE
+ help
+ This provides generic PCI support for PDS devices using the VFIO
+ framework.
+
+ More specific information on this driver can be found in
+ <file:Documentation/networking/device_drivers/ethernet/amd/pds_vfio_pci.rst>.
+
+ To compile this driver as a module, choose M here. The module
+ will be called pds-vfio-pci.
+
+ If you don't know what to do here, say N.
diff --git a/drivers/vfio/pci/pds/Makefile b/drivers/vfio/pci/pds/Makefile
new file mode 100644
index 000000000000..d5a06d81634f
--- /dev/null
+++ b/drivers/vfio/pci/pds/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2023 Advanced Micro Devices, Inc.
+
+obj-$(CONFIG_PDS_VFIO_PCI) += pds-vfio-pci.o
+
+pds-vfio-pci-y := \
+ cmds.o \
+ dirty.o \
+ lm.o \
+ pci_drv.o \
+ vfio_dev.o
diff --git a/drivers/vfio/pci/pds/cmds.c b/drivers/vfio/pci/pds/cmds.c
new file mode 100644
index 000000000000..36463ccc3df9
--- /dev/null
+++ b/drivers/vfio/pci/pds/cmds.c
@@ -0,0 +1,510 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#include <linux/io.h>
+#include <linux/types.h>
+#include <linux/delay.h>
+
+#include <linux/pds/pds_common.h>
+#include <linux/pds/pds_core_if.h>
+#include <linux/pds/pds_adminq.h>
+
+#include "vfio_dev.h"
+#include "cmds.h"
+
+#define SUSPEND_TIMEOUT_S 5
+#define SUSPEND_CHECK_INTERVAL_MS 1
+
+static int pds_vfio_client_adminq_cmd(struct pds_vfio_pci_device *pds_vfio,
+ union pds_core_adminq_cmd *req,
+ union pds_core_adminq_comp *resp,
+ bool fast_poll)
+{
+ struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio);
+ union pds_core_adminq_cmd cmd = {};
+ struct pdsc *pdsc;
+ int err;
+
+ /* Wrap the client request */
+ cmd.client_request.opcode = PDS_AQ_CMD_CLIENT_CMD;
+ cmd.client_request.client_id = cpu_to_le16(pds_vfio->client_id);
+ memcpy(cmd.client_request.client_cmd, req,
+ sizeof(cmd.client_request.client_cmd));
+
+ pdsc = pdsc_get_pf_struct(pdev);
+ if (IS_ERR(pdsc))
+ return PTR_ERR(pdsc);
+
+ err = pdsc_adminq_post(pdsc, &cmd, resp, fast_poll);
+ if (err && err != -EAGAIN)
+ dev_err(pds_vfio_to_dev(pds_vfio),
+ "client admin cmd failed: %pe\n", ERR_PTR(err));
+
+ return err;
+}
+
+int pds_vfio_register_client_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+ struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio);
+ char devname[PDS_DEVNAME_LEN];
+ struct pdsc *pdsc;
+ int ci;
+
+ snprintf(devname, sizeof(devname), "%s.%d-%u", PDS_VFIO_LM_DEV_NAME,
+ pci_domain_nr(pdev->bus),
+ PCI_DEVID(pdev->bus->number, pdev->devfn));
+
+ pdsc = pdsc_get_pf_struct(pdev);
+ if (IS_ERR(pdsc))
+ return PTR_ERR(pdsc);
+
+ ci = pds_client_register(pdsc, devname);
+ if (ci < 0)
+ return ci;
+
+ pds_vfio->client_id = ci;
+
+ return 0;
+}
+
+void pds_vfio_unregister_client_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+ struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio);
+ struct pdsc *pdsc;
+ int err;
+
+ pdsc = pdsc_get_pf_struct(pdev);
+ if (IS_ERR(pdsc))
+ return;
+
+ err = pds_client_unregister(pdsc, pds_vfio->client_id);
+ if (err)
+ dev_err(&pdev->dev, "unregister from DSC failed: %pe\n",
+ ERR_PTR(err));
+
+ pds_vfio->client_id = 0;
+}
+
+static int
+pds_vfio_suspend_wait_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type)
+{
+ union pds_core_adminq_cmd cmd = {
+ .lm_suspend_status = {
+ .opcode = PDS_LM_CMD_SUSPEND_STATUS,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ .type = type,
+ },
+ };
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ union pds_core_adminq_comp comp = {};
+ unsigned long time_limit;
+ unsigned long time_start;
+ unsigned long time_done;
+ int err;
+
+ time_start = jiffies;
+ time_limit = time_start + HZ * SUSPEND_TIMEOUT_S;
+ do {
+ err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, true);
+ if (err != -EAGAIN)
+ break;
+
+ msleep(SUSPEND_CHECK_INTERVAL_MS);
+ } while (time_before(jiffies, time_limit));
+
+ time_done = jiffies;
+ dev_dbg(dev, "%s: vf%u: Suspend comp received in %d msecs\n", __func__,
+ pds_vfio->vf_id, jiffies_to_msecs(time_done - time_start));
+
+ /* Check the results */
+ if (time_after_eq(time_done, time_limit)) {
+ dev_err(dev, "%s: vf%u: Suspend comp timeout\n", __func__,
+ pds_vfio->vf_id);
+ err = -ETIMEDOUT;
+ }
+
+ return err;
+}
+
+int pds_vfio_suspend_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type)
+{
+ union pds_core_adminq_cmd cmd = {
+ .lm_suspend = {
+ .opcode = PDS_LM_CMD_SUSPEND,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ .type = type,
+ },
+ };
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ union pds_core_adminq_comp comp = {};
+ int err;
+
+ dev_dbg(dev, "vf%u: Suspend device\n", pds_vfio->vf_id);
+
+ /*
+ * The initial suspend request asks the firmware to start the device
+ * suspend operation; the firmware returns success once the operation
+ * has been started.
+ */
+ err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, true);
+ if (err) {
+ dev_err(dev, "vf%u: Suspend failed: %pe\n", pds_vfio->vf_id,
+ ERR_PTR(err));
+ return err;
+ }
+
+ /*
+ * The subsequent suspend status request(s) check if the firmware has
+ * completed the device suspend process.
+ */
+ return pds_vfio_suspend_wait_device_cmd(pds_vfio, type);
+}
+
+int pds_vfio_resume_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type)
+{
+ union pds_core_adminq_cmd cmd = {
+ .lm_resume = {
+ .opcode = PDS_LM_CMD_RESUME,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ .type = type,
+ },
+ };
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ union pds_core_adminq_comp comp = {};
+
+ dev_dbg(dev, "vf%u: Resume device\n", pds_vfio->vf_id);
+
+ return pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, true);
+}
+
+int pds_vfio_get_lm_state_size_cmd(struct pds_vfio_pci_device *pds_vfio, u64 *size)
+{
+ union pds_core_adminq_cmd cmd = {
+ .lm_state_size = {
+ .opcode = PDS_LM_CMD_STATE_SIZE,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ },
+ };
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ union pds_core_adminq_comp comp = {};
+ int err;
+
+ dev_dbg(dev, "vf%u: Get migration status\n", pds_vfio->vf_id);
+
+ err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+ if (err)
+ return err;
+
+ *size = le64_to_cpu(comp.lm_state_size.size);
+ return 0;
+}
+
+static int pds_vfio_dma_map_lm_file(struct device *dev,
+ enum dma_data_direction dir,
+ struct pds_vfio_lm_file *lm_file)
+{
+ struct pds_lm_sg_elem *sgl, *sge;
+ struct scatterlist *sg;
+ dma_addr_t sgl_addr;
+ size_t sgl_size;
+ int err;
+ int i;
+
+ if (!lm_file)
+ return -EINVAL;
+
+ /* dma map file pages */
+ err = dma_map_sgtable(dev, &lm_file->sg_table, dir, 0);
+ if (err)
+ return err;
+
+ lm_file->num_sge = lm_file->sg_table.nents;
+
+ /* alloc sgl */
+ sgl_size = lm_file->num_sge * sizeof(struct pds_lm_sg_elem);
+ sgl = kzalloc(sgl_size, GFP_KERNEL);
+ if (!sgl) {
+ err = -ENOMEM;
+ goto out_unmap_sgtable;
+ }
+
+ /* fill sgl */
+ sge = sgl;
+ for_each_sgtable_dma_sg(&lm_file->sg_table, sg, i) {
+ sge->addr = cpu_to_le64(sg_dma_address(sg));
+ sge->len = cpu_to_le32(sg_dma_len(sg));
+ dev_dbg(dev, "addr = %llx, len = %u\n", sge->addr, sge->len);
+ sge++;
+ }
+
+ sgl_addr = dma_map_single(dev, sgl, sgl_size, DMA_TO_DEVICE);
+ if (dma_mapping_error(dev, sgl_addr)) {
+ err = -EIO;
+ goto out_free_sgl;
+ }
+
+ lm_file->sgl = sgl;
+ lm_file->sgl_addr = sgl_addr;
+
+ return 0;
+
+out_free_sgl:
+ kfree(sgl);
+out_unmap_sgtable:
+ lm_file->num_sge = 0;
+ dma_unmap_sgtable(dev, &lm_file->sg_table, dir, 0);
+ return err;
+}
+
+static void pds_vfio_dma_unmap_lm_file(struct device *dev,
+ enum dma_data_direction dir,
+ struct pds_vfio_lm_file *lm_file)
+{
+ if (!lm_file)
+ return;
+
+ /* free sgl */
+ if (lm_file->sgl) {
+ dma_unmap_single(dev, lm_file->sgl_addr,
+ lm_file->num_sge * sizeof(*lm_file->sgl),
+ DMA_TO_DEVICE);
+ kfree(lm_file->sgl);
+ lm_file->sgl = NULL;
+ lm_file->sgl_addr = DMA_MAPPING_ERROR;
+ lm_file->num_sge = 0;
+ }
+
+ /* dma unmap file pages */
+ dma_unmap_sgtable(dev, &lm_file->sg_table, dir, 0);
+}
+
+int pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+ union pds_core_adminq_cmd cmd = {
+ .lm_save = {
+ .opcode = PDS_LM_CMD_SAVE,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ },
+ };
+ struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio);
+ struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+ union pds_core_adminq_comp comp = {};
+ struct pds_vfio_lm_file *lm_file;
+ int err;
+
+ dev_dbg(&pdev->dev, "vf%u: Get migration state\n", pds_vfio->vf_id);
+
+ lm_file = pds_vfio->save_file;
+
+ err = pds_vfio_dma_map_lm_file(pdsc_dev, DMA_FROM_DEVICE, lm_file);
+ if (err) {
+ dev_err(&pdev->dev, "failed to map save migration file: %pe\n",
+ ERR_PTR(err));
+ return err;
+ }
+
+ cmd.lm_save.sgl_addr = cpu_to_le64(lm_file->sgl_addr);
+ cmd.lm_save.num_sge = cpu_to_le32(lm_file->num_sge);
+
+ err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+ if (err)
+ dev_err(&pdev->dev, "failed to get migration state: %pe\n",
+ ERR_PTR(err));
+
+ pds_vfio_dma_unmap_lm_file(pdsc_dev, DMA_FROM_DEVICE, lm_file);
+
+ return err;
+}
+
+int pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+ union pds_core_adminq_cmd cmd = {
+ .lm_restore = {
+ .opcode = PDS_LM_CMD_RESTORE,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ },
+ };
+ struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio);
+ struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+ union pds_core_adminq_comp comp = {};
+ struct pds_vfio_lm_file *lm_file;
+ int err;
+
+ dev_dbg(&pdev->dev, "vf%u: Set migration state\n", pds_vfio->vf_id);
+
+ lm_file = pds_vfio->restore_file;
+
+ err = pds_vfio_dma_map_lm_file(pdsc_dev, DMA_TO_DEVICE, lm_file);
+ if (err) {
+ dev_err(&pdev->dev,
+ "failed to map restore migration file: %pe\n",
+ ERR_PTR(err));
+ return err;
+ }
+
+ cmd.lm_restore.sgl_addr = cpu_to_le64(lm_file->sgl_addr);
+ cmd.lm_restore.num_sge = cpu_to_le32(lm_file->num_sge);
+
+ err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+ if (err)
+ dev_err(&pdev->dev, "failed to set migration state: %pe\n",
+ ERR_PTR(err));
+
+ pds_vfio_dma_unmap_lm_file(pdsc_dev, DMA_TO_DEVICE, lm_file);
+
+ return err;
+}
+
+void pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio,
+ enum pds_lm_host_vf_status vf_status)
+{
+ union pds_core_adminq_cmd cmd = {
+ .lm_host_vf_status = {
+ .opcode = PDS_LM_CMD_HOST_VF_STATUS,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ .status = vf_status,
+ },
+ };
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ union pds_core_adminq_comp comp = {};
+ int err;
+
+ dev_dbg(dev, "vf%u: Set host VF LM status: %u", pds_vfio->vf_id,
+ vf_status);
+ if (vf_status != PDS_LM_STA_IN_PROGRESS &&
+ vf_status != PDS_LM_STA_NONE) {
+ dev_warn(dev, "Invalid host VF migration status, %d\n",
+ vf_status);
+ return;
+ }
+
+ err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+ if (err)
+ dev_warn(dev, "failed to send host VF migration status: %pe\n",
+ ERR_PTR(err));
+}
+
+int pds_vfio_dirty_status_cmd(struct pds_vfio_pci_device *pds_vfio,
+ u64 regions_dma, u8 *max_regions, u8 *num_regions)
+{
+ union pds_core_adminq_cmd cmd = {
+ .lm_dirty_status = {
+ .opcode = PDS_LM_CMD_DIRTY_STATUS,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ },
+ };
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ union pds_core_adminq_comp comp = {};
+ int err;
+
+ dev_dbg(dev, "vf%u: Dirty status\n", pds_vfio->vf_id);
+
+ cmd.lm_dirty_status.regions_dma = cpu_to_le64(regions_dma);
+ cmd.lm_dirty_status.max_regions = *max_regions;
+
+ err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+ if (err) {
+ dev_err(dev, "failed to get dirty status: %pe\n", ERR_PTR(err));
+ return err;
+ }
+
+ /* only support seq_ack approach for now */
+ if (!(le32_to_cpu(comp.lm_dirty_status.bmp_type_mask) &
+ BIT(PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK))) {
+ dev_err(dev, "Dirty bitmap tracking SEQ_ACK not supported\n");
+ return -EOPNOTSUPP;
+ }
+
+ *num_regions = comp.lm_dirty_status.num_regions;
+ *max_regions = comp.lm_dirty_status.max_regions;
+
+ dev_dbg(dev,
+ "Page Tracking Status command successful, max_regions: %d, num_regions: %d, bmp_type: %s\n",
+ *max_regions, *num_regions, "PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK");
+
+ return 0;
+}
+
+int pds_vfio_dirty_enable_cmd(struct pds_vfio_pci_device *pds_vfio,
+ u64 regions_dma, u8 num_regions)
+{
+ union pds_core_adminq_cmd cmd = {
+ .lm_dirty_enable = {
+ .opcode = PDS_LM_CMD_DIRTY_ENABLE,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ .regions_dma = cpu_to_le64(regions_dma),
+ .bmp_type = PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK,
+ .num_regions = num_regions,
+ },
+ };
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ union pds_core_adminq_comp comp = {};
+ int err;
+
+ err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+ if (err) {
+ dev_err(dev, "failed dirty tracking enable: %pe\n",
+ ERR_PTR(err));
+ return err;
+ }
+
+ return 0;
+}
+
+int pds_vfio_dirty_disable_cmd(struct pds_vfio_pci_device *pds_vfio)
+{
+ union pds_core_adminq_cmd cmd = {
+ .lm_dirty_disable = {
+ .opcode = PDS_LM_CMD_DIRTY_DISABLE,
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ },
+ };
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ union pds_core_adminq_comp comp = {};
+ int err;
+
+ err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+ if (err || comp.lm_dirty_status.num_regions != 0) {
+ /* in case num_regions is still non-zero after disable */
+ err = err ? err : -EIO;
+ dev_err(dev,
+ "failed dirty tracking disable: %pe, num_regions %d\n",
+ ERR_PTR(err), comp.lm_dirty_status.num_regions);
+ return err;
+ }
+
+ return 0;
+}
+
+int pds_vfio_dirty_seq_ack_cmd(struct pds_vfio_pci_device *pds_vfio,
+ u64 sgl_dma, u16 num_sge, u32 offset,
+ u32 total_len, bool read_seq)
+{
+ const char *cmd_type_str = read_seq ? "read_seq" : "write_ack";
+ union pds_core_adminq_cmd cmd = {
+ .lm_dirty_seq_ack = {
+ .vf_id = cpu_to_le16(pds_vfio->vf_id),
+ .len_bytes = cpu_to_le32(total_len),
+ .off_bytes = cpu_to_le32(offset),
+ .sgl_addr = cpu_to_le64(sgl_dma),
+ .num_sge = cpu_to_le16(num_sge),
+ },
+ };
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ union pds_core_adminq_comp comp = {};
+ int err;
+
+ if (read_seq)
+ cmd.lm_dirty_seq_ack.opcode = PDS_LM_CMD_DIRTY_READ_SEQ;
+ else
+ cmd.lm_dirty_seq_ack.opcode = PDS_LM_CMD_DIRTY_WRITE_ACK;
+
+ err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false);
+ if (err) {
+ dev_err(dev, "failed cmd Page Tracking %s: %pe\n", cmd_type_str,
+ ERR_PTR(err));
+ return err;
+ }
+
+ return 0;
+}
diff --git a/drivers/vfio/pci/pds/cmds.h b/drivers/vfio/pci/pds/cmds.h
new file mode 100644
index 000000000000..95221100b954
--- /dev/null
+++ b/drivers/vfio/pci/pds/cmds.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _CMDS_H_
+#define _CMDS_H_
+
+int pds_vfio_register_client_cmd(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_unregister_client_cmd(struct pds_vfio_pci_device *pds_vfio);
+int pds_vfio_suspend_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type);
+int pds_vfio_resume_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type);
+int pds_vfio_get_lm_state_size_cmd(struct pds_vfio_pci_device *pds_vfio, u64 *size);
+int pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio);
+int pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio,
+ enum pds_lm_host_vf_status vf_status);
+int pds_vfio_dirty_status_cmd(struct pds_vfio_pci_device *pds_vfio,
+ u64 regions_dma, u8 *max_regions,
+ u8 *num_regions);
+int pds_vfio_dirty_enable_cmd(struct pds_vfio_pci_device *pds_vfio,
+ u64 regions_dma, u8 num_regions);
+int pds_vfio_dirty_disable_cmd(struct pds_vfio_pci_device *pds_vfio);
+int pds_vfio_dirty_seq_ack_cmd(struct pds_vfio_pci_device *pds_vfio,
+ u64 sgl_dma, u16 num_sge, u32 offset,
+ u32 total_len, bool read_seq);
+#endif /* _CMDS_H_ */
diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c
new file mode 100644
index 000000000000..c937aa6f3954
--- /dev/null
+++ b/drivers/vfio/pci/pds/dirty.c
@@ -0,0 +1,564 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#include <linux/interval_tree.h>
+#include <linux/vfio.h>
+
+#include <linux/pds/pds_common.h>
+#include <linux/pds/pds_core_if.h>
+#include <linux/pds/pds_adminq.h>
+
+#include "vfio_dev.h"
+#include "cmds.h"
+#include "dirty.h"
+
+#define READ_SEQ true
+#define WRITE_ACK false
+
+bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio)
+{
+ return pds_vfio->dirty.is_enabled;
+}
+
+void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio)
+{
+ pds_vfio->dirty.is_enabled = true;
+}
+
+void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio)
+{
+ pds_vfio->dirty.is_enabled = false;
+}
+
+static void
+pds_vfio_print_guest_region_info(struct pds_vfio_pci_device *pds_vfio,
+ u8 max_regions)
+{
+ int len = max_regions * sizeof(struct pds_lm_dirty_region_info);
+ struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
+ struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+ struct pds_lm_dirty_region_info *region_info;
+ dma_addr_t regions_dma;
+ u8 num_regions;
+ int err;
+
+ region_info = kcalloc(max_regions,
+ sizeof(struct pds_lm_dirty_region_info),
+ GFP_KERNEL);
+ if (!region_info)
+ return;
+
+ regions_dma =
+ dma_map_single(pdsc_dev, region_info, len, DMA_FROM_DEVICE);
+ if (dma_mapping_error(pdsc_dev, regions_dma))
+ goto out_free_region_info;
+
+ err = pds_vfio_dirty_status_cmd(pds_vfio, regions_dma, &max_regions,
+ &num_regions);
+ dma_unmap_single(pdsc_dev, regions_dma, len, DMA_FROM_DEVICE);
+ if (err)
+ goto out_free_region_info;
+
+ for (unsigned int i = 0; i < num_regions; i++)
+ dev_dbg(&pdev->dev,
+ "region_info[%d]: dma_base 0x%llx page_count %u page_size_log2 %u\n",
+ i, le64_to_cpu(region_info[i].dma_base),
+ le32_to_cpu(region_info[i].page_count),
+ region_info[i].page_size_log2);
+
+out_free_region_info:
+ kfree(region_info);
+}
+
+static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_dirty *dirty,
+ unsigned long bytes)
+{
+ unsigned long *host_seq_bmp, *host_ack_bmp;
+
+ host_seq_bmp = vzalloc(bytes);
+ if (!host_seq_bmp)
+ return -ENOMEM;
+
+ host_ack_bmp = vzalloc(bytes);
+ if (!host_ack_bmp) {
+ vfree(host_seq_bmp);
+ return -ENOMEM;
+ }
+
+ dirty->host_seq.bmp = host_seq_bmp;
+ dirty->host_ack.bmp = host_ack_bmp;
+
+ return 0;
+}
+
+static void pds_vfio_dirty_free_bitmaps(struct pds_vfio_dirty *dirty)
+{
+ vfree(dirty->host_seq.bmp);
+ vfree(dirty->host_ack.bmp);
+ dirty->host_seq.bmp = NULL;
+ dirty->host_ack.bmp = NULL;
+}
+
+static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio,
+ struct pds_vfio_bmp_info *bmp_info)
+{
+ struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
+ struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+
+ dma_unmap_single(pdsc_dev, bmp_info->sgl_addr,
+ bmp_info->num_sge * sizeof(struct pds_lm_sg_elem),
+ DMA_BIDIRECTIONAL);
+ kfree(bmp_info->sgl);
+
+ bmp_info->num_sge = 0;
+ bmp_info->sgl = NULL;
+ bmp_info->sgl_addr = 0;
+}
+
+static void pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio)
+{
+ if (pds_vfio->dirty.host_seq.sgl)
+ __pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_seq);
+ if (pds_vfio->dirty.host_ack.sgl)
+ __pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_ack);
+}
+
+static int __pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio,
+ struct pds_vfio_bmp_info *bmp_info,
+ u32 page_count)
+{
+ struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
+ struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+ struct pds_lm_sg_elem *sgl;
+ dma_addr_t sgl_addr;
+ size_t sgl_size;
+ u32 max_sge;
+
+ max_sge = DIV_ROUND_UP(page_count, PAGE_SIZE * 8);
+ sgl_size = max_sge * sizeof(struct pds_lm_sg_elem);
+
+ sgl = kzalloc(sgl_size, GFP_KERNEL);
+ if (!sgl)
+ return -ENOMEM;
+
+ sgl_addr = dma_map_single(pdsc_dev, sgl, sgl_size, DMA_BIDIRECTIONAL);
+ if (dma_mapping_error(pdsc_dev, sgl_addr)) {
+ kfree(sgl);
+ return -EIO;
+ }
+
+ bmp_info->sgl = sgl;
+ bmp_info->num_sge = max_sge;
+ bmp_info->sgl_addr = sgl_addr;
+
+ return 0;
+}
+
+static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio,
+ u32 page_count)
+{
+ struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
+ int err;
+
+ err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_seq,
+ page_count);
+ if (err)
+ return err;
+
+ err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_ack,
+ page_count);
+ if (err) {
+ __pds_vfio_dirty_free_sgl(pds_vfio, &dirty->host_seq);
+ return err;
+ }
+
+ return 0;
+}
+
+static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio,
+ struct rb_root_cached *ranges, u32 nnodes,
+ u64 *page_size)
+{
+ struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
+ struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+ struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
+ u64 region_start, region_size, region_page_size;
+ struct pds_lm_dirty_region_info *region_info;
+ struct interval_tree_node *node = NULL;
+ u8 max_regions = 0, num_regions;
+ dma_addr_t regions_dma = 0;
+ u32 num_ranges = nnodes;
+ u32 page_count;
+ u16 len;
+ int err;
+
+ dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n",
+ pds_vfio->vf_id);
+
+ if (pds_vfio_dirty_is_enabled(pds_vfio))
+ return -EINVAL;
+
+ /* find if dirty tracking is disabled, i.e. num_regions == 0 */
+ err = pds_vfio_dirty_status_cmd(pds_vfio, 0, &max_regions,
+ &num_regions);
+ if (err < 0) {
+ dev_err(&pdev->dev, "Failed to get dirty status, err %pe\n",
+ ERR_PTR(err));
+ return err;
+ } else if (num_regions) {
+ dev_err(&pdev->dev,
+ "Dirty tracking already enabled for %d regions\n",
+ num_regions);
+ return -EEXIST;
+ } else if (!max_regions) {
+ dev_err(&pdev->dev,
+ "Device doesn't support dirty tracking, max_regions %d\n",
+ max_regions);
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * Only support 1 region for now. If there are any large gaps in the
+ * VM's address regions, then this would be a waste of memory as we are
+ * generating 2 bitmaps (ack/seq) from the min address to the max
+ * address of the VM's address regions. In the future, if we support
+ * more than one region in the device/driver we can split the bitmaps
+ * on the largest address region gaps. We can do this split up to the
+ * max_regions times returned from the dirty_status command.
+ */
+ max_regions = 1;
+ if (num_ranges > max_regions) {
+ vfio_combine_iova_ranges(ranges, nnodes, max_regions);
+ num_ranges = max_regions;
+ }
+
+ node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
+ if (!node)
+ return -EINVAL;
+
+ region_size = node->last - node->start + 1;
+ region_start = node->start;
+ region_page_size = *page_size;
+
+ len = sizeof(*region_info);
+ region_info = kzalloc(len, GFP_KERNEL);
+ if (!region_info)
+ return -ENOMEM;
+
+ page_count = DIV_ROUND_UP(region_size, region_page_size);
+
+ region_info->dma_base = cpu_to_le64(region_start);
+ region_info->page_count = cpu_to_le32(page_count);
+ region_info->page_size_log2 = ilog2(region_page_size);
+
+ regions_dma = dma_map_single(pdsc_dev, (void *)region_info, len,
+ DMA_BIDIRECTIONAL);
+ if (dma_mapping_error(pdsc_dev, regions_dma)) {
+ err = -ENOMEM;
+ goto out_free_region_info;
+ }
+
+ err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, max_regions);
+ dma_unmap_single(pdsc_dev, regions_dma, len, DMA_BIDIRECTIONAL);
+ if (err)
+ goto out_free_region_info;
+
+ /*
+ * page_count might have been adjusted by the device, so read the
+ * updated value back before region_info is freed.
+ */
+ page_count = le32_to_cpu(region_info->page_count);
+
+ dev_dbg(&pdev->dev,
+ "region_info: regions_dma 0x%llx dma_base 0x%llx page_count %u page_size_log2 %u\n",
+ regions_dma, region_start, page_count,
+ (u8)ilog2(region_page_size));
+
+ err = pds_vfio_dirty_alloc_bitmaps(dirty, page_count / BITS_PER_BYTE);
+ if (err) {
+ dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n",
+ ERR_PTR(err));
+ goto out_free_region_info;
+ }
+
+ err = pds_vfio_dirty_alloc_sgl(pds_vfio, page_count);
+ if (err) {
+ dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n",
+ ERR_PTR(err));
+ goto out_free_bitmaps;
+ }
+
+ dirty->region_start = region_start;
+ dirty->region_size = region_size;
+ dirty->region_page_size = region_page_size;
+ pds_vfio_dirty_set_enabled(pds_vfio);
+
+ pds_vfio_print_guest_region_info(pds_vfio, max_regions);
+
+ kfree(region_info);
+
+ return 0;
+
+out_free_bitmaps:
+ pds_vfio_dirty_free_bitmaps(dirty);
+out_free_region_info:
+ kfree(region_info);
+ return err;
+}
+
+void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, bool send_cmd)
+{
+ if (pds_vfio_dirty_is_enabled(pds_vfio)) {
+ pds_vfio_dirty_set_disabled(pds_vfio);
+ if (send_cmd)
+ pds_vfio_dirty_disable_cmd(pds_vfio);
+ pds_vfio_dirty_free_sgl(pds_vfio);
+ pds_vfio_dirty_free_bitmaps(&pds_vfio->dirty);
+ }
+
+ if (send_cmd)
+ pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE);
+}
+
+static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio,
+ struct pds_vfio_bmp_info *bmp_info,
+ u32 offset, u32 bmp_bytes, bool read_seq)
+{
+ const char *bmp_type_str = read_seq ? "read_seq" : "write_ack";
+ u8 dma_dir = read_seq ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+ struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
+ struct device *pdsc_dev = &pci_physfn(pdev)->dev;
+ unsigned long long npages;
+ struct sg_table sg_table;
+ struct scatterlist *sg;
+ struct page **pages;
+ u32 page_offset;
+ const void *bmp;
+ size_t size;
+ u16 num_sge;
+ int err;
+ int i;
+
+ bmp = (void *)((u64)bmp_info->bmp + offset);
+ page_offset = offset_in_page(bmp);
+ bmp -= page_offset;
+
+ /*
+ * The start and end of the bitmap section being seq'd/ack'd might not
+ * be page aligned, so account for page_offset when sizing the page
+ * array so that it covers all of bmp_bytes.
+ */
+ npages = DIV_ROUND_UP_ULL(bmp_bytes + page_offset, PAGE_SIZE);
+ pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ for (unsigned long long i = 0; i < npages; i++) {
+ struct page *page = vmalloc_to_page(bmp);
+
+ if (!page) {
+ err = -EFAULT;
+ goto out_free_pages;
+ }
+
+ pages[i] = page;
+ bmp += PAGE_SIZE;
+ }
+
+ err = sg_alloc_table_from_pages(&sg_table, pages, npages, page_offset,
+ bmp_bytes, GFP_KERNEL);
+ if (err)
+ goto out_free_pages;
+
+ err = dma_map_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
+ if (err)
+ goto out_free_sg_table;
+
+ for_each_sgtable_dma_sg(&sg_table, sg, i) {
+ struct pds_lm_sg_elem *sg_elem = &bmp_info->sgl[i];
+
+ sg_elem->addr = cpu_to_le64(sg_dma_address(sg));
+ sg_elem->len = cpu_to_le32(sg_dma_len(sg));
+ }
+
+ num_sge = sg_table.nents;
+ size = num_sge * sizeof(struct pds_lm_sg_elem);
+ dma_sync_single_for_device(pdsc_dev, bmp_info->sgl_addr, size, dma_dir);
+ err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, bmp_info->sgl_addr, num_sge,
+ offset, bmp_bytes, read_seq);
+ if (err)
+ dev_err(&pdev->dev,
+ "Dirty bitmap %s failed offset %u bmp_bytes %u num_sge %u DMA 0x%llx: %pe\n",
+ bmp_type_str, offset, bmp_bytes,
+ num_sge, bmp_info->sgl_addr, ERR_PTR(err));
+ dma_sync_single_for_cpu(pdsc_dev, bmp_info->sgl_addr, size, dma_dir);
+
+ dma_unmap_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
+out_free_sg_table:
+ sg_free_table(&sg_table);
+out_free_pages:
+ kfree(pages);
+
+ return err;
+}
+
+static int pds_vfio_dirty_write_ack(struct pds_vfio_pci_device *pds_vfio,
+ u32 offset, u32 len)
+{
+ return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_ack,
+ offset, len, WRITE_ACK);
+}
+
+static int pds_vfio_dirty_read_seq(struct pds_vfio_pci_device *pds_vfio,
+ u32 offset, u32 len)
+{
+ return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_seq,
+ offset, len, READ_SEQ);
+}
+
+static int pds_vfio_dirty_process_bitmaps(struct pds_vfio_pci_device *pds_vfio,
+ struct iova_bitmap *dirty_bitmap,
+ u32 bmp_offset, u32 len_bytes)
+{
+ u64 page_size = pds_vfio->dirty.region_page_size;
+ u64 region_start = pds_vfio->dirty.region_start;
+ u32 bmp_offset_bit;
+ __le64 *seq, *ack;
+ int dword_count;
+
+ dword_count = len_bytes / sizeof(u64);
+ seq = (__le64 *)((u64)pds_vfio->dirty.host_seq.bmp + bmp_offset);
+ ack = (__le64 *)((u64)pds_vfio->dirty.host_ack.bmp + bmp_offset);
+ bmp_offset_bit = bmp_offset * 8;
+
+ for (int i = 0; i < dword_count; i++) {
+ u64 xor = le64_to_cpu(seq[i]) ^ le64_to_cpu(ack[i]);
+
+ /* prepare for next write_ack call */
+ ack[i] = seq[i];
+
+ for (u8 bit_i = 0; bit_i < BITS_PER_TYPE(u64); ++bit_i) {
+ if (xor & BIT(bit_i)) {
+ u64 abs_bit_i = bmp_offset_bit +
+ i * BITS_PER_TYPE(u64) + bit_i;
+ u64 addr = abs_bit_i * page_size + region_start;
+
+ iova_bitmap_set(dirty_bitmap, addr, page_size);
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio,
+ struct iova_bitmap *dirty_bitmap,
+ unsigned long iova, unsigned long length)
+{
+ struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
+ struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
+ u64 bmp_offset, bmp_bytes;
+ u64 bitmap_size, pages;
+ int err;
+
+ dev_dbg(dev, "vf%u: Get dirty page bitmap\n", pds_vfio->vf_id);
+
+ if (!pds_vfio_dirty_is_enabled(pds_vfio)) {
+ dev_err(dev, "vf%u: Sync failed, dirty tracking is disabled\n",
+ pds_vfio->vf_id);
+ return -EINVAL;
+ }
+
+ pages = DIV_ROUND_UP(length, pds_vfio->dirty.region_page_size);
+ bitmap_size =
+ round_up(pages, sizeof(u64) * BITS_PER_BYTE) / BITS_PER_BYTE;
+
+ dev_dbg(dev,
+ "vf%u: iova 0x%lx length %lu page_size %llu pages %llu bitmap_size %llu\n",
+ pds_vfio->vf_id, iova, length, pds_vfio->dirty.region_page_size,
+ pages, bitmap_size);
+
+ if (!length || ((dirty->region_start + iova + length) >
+ (dirty->region_start + dirty->region_size))) {
+ dev_err(dev, "Invalid iova 0x%lx and/or length 0x%lx to sync\n",
+ iova, length);
+ return -EINVAL;
+ }
+
+ /* bitmap is modified in 64 bit chunks */
+ bmp_bytes = ALIGN(DIV_ROUND_UP(length / dirty->region_page_size,
+ sizeof(u64)),
+ sizeof(u64));
+ if (bmp_bytes != bitmap_size) {
+ dev_err(dev,
+ "Calculated bitmap bytes %llu not equal to bitmap size %llu\n",
+ bmp_bytes, bitmap_size);
+ return -EINVAL;
+ }
+
+ bmp_offset = DIV_ROUND_UP(iova / dirty->region_page_size, sizeof(u64));
+
+ dev_dbg(dev,
+ "Syncing dirty bitmap, iova 0x%lx length 0x%lx, bmp_offset %llu bmp_bytes %llu\n",
+ iova, length, bmp_offset, bmp_bytes);
+
+ err = pds_vfio_dirty_read_seq(pds_vfio, bmp_offset, bmp_bytes);
+ if (err)
+ return err;
+
+ err = pds_vfio_dirty_process_bitmaps(pds_vfio, dirty_bitmap, bmp_offset,
+ bmp_bytes);
+ if (err)
+ return err;
+
+ err = pds_vfio_dirty_write_ack(pds_vfio, bmp_offset, bmp_bytes);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova,
+ unsigned long length, struct iova_bitmap *dirty)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(vdev, struct pds_vfio_pci_device,
+ vfio_coredev.vdev);
+ int err;
+
+ mutex_lock(&pds_vfio->state_mutex);
+ err = pds_vfio_dirty_sync(pds_vfio, dirty, iova, length);
+ pds_vfio_state_mutex_unlock(pds_vfio);
+
+ return err;
+}
+
+int pds_vfio_dma_logging_start(struct vfio_device *vdev,
+ struct rb_root_cached *ranges, u32 nnodes,
+ u64 *page_size)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(vdev, struct pds_vfio_pci_device,
+ vfio_coredev.vdev);
+ int err;
+
+ mutex_lock(&pds_vfio->state_mutex);
+ pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS);
+ err = pds_vfio_dirty_enable(pds_vfio, ranges, nnodes, page_size);
+ pds_vfio_state_mutex_unlock(pds_vfio);
+
+ return err;
+}
+
+int pds_vfio_dma_logging_stop(struct vfio_device *vdev)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(vdev, struct pds_vfio_pci_device,
+ vfio_coredev.vdev);
+
+ mutex_lock(&pds_vfio->state_mutex);
+ pds_vfio_dirty_disable(pds_vfio, true);
+ pds_vfio_state_mutex_unlock(pds_vfio);
+
+ return 0;
+}
diff --git a/drivers/vfio/pci/pds/dirty.h b/drivers/vfio/pci/pds/dirty.h
new file mode 100644
index 000000000000..f78da25d75ca
--- /dev/null
+++ b/drivers/vfio/pci/pds/dirty.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _DIRTY_H_
+#define _DIRTY_H_
+
+struct pds_vfio_bmp_info {
+ unsigned long *bmp;
+ u32 bmp_bytes;
+ struct pds_lm_sg_elem *sgl;
+ dma_addr_t sgl_addr;
+ u16 num_sge;
+};
+
+struct pds_vfio_dirty {
+ struct pds_vfio_bmp_info host_seq;
+ struct pds_vfio_bmp_info host_ack;
+ u64 region_size;
+ u64 region_start;
+ u64 region_page_size;
+ bool is_enabled;
+};
+
+struct pds_vfio_pci_device;
+
+bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio,
+ bool send_cmd);
+
+int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova,
+ unsigned long length,
+ struct iova_bitmap *dirty);
+int pds_vfio_dma_logging_start(struct vfio_device *vdev,
+ struct rb_root_cached *ranges, u32 nnodes,
+ u64 *page_size);
+int pds_vfio_dma_logging_stop(struct vfio_device *vdev);
+#endif /* _DIRTY_H_ */
diff --git a/drivers/vfio/pci/pds/lm.c b/drivers/vfio/pci/pds/lm.c
new file mode 100644
index 000000000000..79fe2e66bb49
--- /dev/null
+++ b/drivers/vfio/pci/pds/lm.c
@@ -0,0 +1,434 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/vfio.h>
+#include <linux/vfio_pci_core.h>
+
+#include "vfio_dev.h"
+#include "cmds.h"
+
+static struct pds_vfio_lm_file *
+pds_vfio_get_lm_file(const struct file_operations *fops, int flags, u64 size)
+{
+ struct pds_vfio_lm_file *lm_file = NULL;
+ unsigned long long npages;
+ struct page **pages;
+ void *page_mem;
+ const void *p;
+
+ if (!size)
+ return NULL;
+
+ /* Alloc file structure */
+ lm_file = kzalloc(sizeof(*lm_file), GFP_KERNEL);
+ if (!lm_file)
+ return NULL;
+
+ /* Create file */
+ lm_file->filep =
+ anon_inode_getfile("pds_vfio_lm", fops, lm_file, flags);
+ if (IS_ERR(lm_file->filep))
+ goto out_free_file;
+
+ stream_open(lm_file->filep->f_inode, lm_file->filep);
+ mutex_init(&lm_file->lock);
+
+ /* prevent file from being released before we are done with it */
+ get_file(lm_file->filep);
+
+ /* Allocate memory for file pages */
+ npages = DIV_ROUND_UP_ULL(size, PAGE_SIZE);
+ pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
+ if (!pages)
+ goto out_put_file;
+
+ page_mem = kvzalloc(ALIGN(size, PAGE_SIZE), GFP_KERNEL);
+ if (!page_mem)
+ goto out_free_pages_array;
+
+ p = page_mem - offset_in_page(page_mem);
+ for (unsigned long long i = 0; i < npages; i++) {
+ if (is_vmalloc_addr(p))
+ pages[i] = vmalloc_to_page(p);
+ else
+ pages[i] = kmap_to_page((void *)p);
+ if (!pages[i])
+ goto out_free_page_mem;
+
+ p += PAGE_SIZE;
+ }
+
+ /* Create scatterlist of file pages to use for DMA mapping later */
+ if (sg_alloc_table_from_pages(&lm_file->sg_table, pages, npages, 0,
+ size, GFP_KERNEL))
+ goto out_free_page_mem;
+
+ lm_file->size = size;
+ lm_file->pages = pages;
+ lm_file->npages = npages;
+ lm_file->page_mem = page_mem;
+ lm_file->alloc_size = npages * PAGE_SIZE;
+
+ return lm_file;
+
+out_free_page_mem:
+ kvfree(page_mem);
+out_free_pages_array:
+ kfree(pages);
+out_put_file:
+ fput(lm_file->filep);
+ mutex_destroy(&lm_file->lock);
+out_free_file:
+ kfree(lm_file);
+
+ return NULL;
+}
+
+static void pds_vfio_put_lm_file(struct pds_vfio_lm_file *lm_file)
+{
+ mutex_lock(&lm_file->lock);
+
+ lm_file->size = 0;
+ lm_file->alloc_size = 0;
+
+ /* Free scatter list of file pages */
+ sg_free_table(&lm_file->sg_table);
+
+ kvfree(lm_file->page_mem);
+ lm_file->page_mem = NULL;
+ kfree(lm_file->pages);
+ lm_file->pages = NULL;
+
+ mutex_unlock(&lm_file->lock);
+
+ /* allow file to be released since we are done with it */
+ fput(lm_file->filep);
+}
+
+void pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio)
+{
+ if (!pds_vfio->save_file)
+ return;
+
+ pds_vfio_put_lm_file(pds_vfio->save_file);
+ pds_vfio->save_file = NULL;
+}
+
+void pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio)
+{
+ if (!pds_vfio->restore_file)
+ return;
+
+ pds_vfio_put_lm_file(pds_vfio->restore_file);
+ pds_vfio->restore_file = NULL;
+}
+
+static struct page *pds_vfio_get_file_page(struct pds_vfio_lm_file *lm_file,
+ unsigned long offset)
+{
+ unsigned long cur_offset = 0;
+ struct scatterlist *sg;
+ unsigned int i;
+
+ /* All accesses are sequential */
+ if (offset < lm_file->last_offset || !lm_file->last_offset_sg) {
+ lm_file->last_offset = 0;
+ lm_file->last_offset_sg = lm_file->sg_table.sgl;
+ lm_file->sg_last_entry = 0;
+ }
+
+ cur_offset = lm_file->last_offset;
+
+ for_each_sg(lm_file->last_offset_sg, sg,
+ lm_file->sg_table.orig_nents - lm_file->sg_last_entry, i) {
+ if (offset < sg->length + cur_offset) {
+ lm_file->last_offset_sg = sg;
+ lm_file->sg_last_entry += i;
+ lm_file->last_offset = cur_offset;
+ return nth_page(sg_page(sg),
+ (offset - cur_offset) / PAGE_SIZE);
+ }
+ cur_offset += sg->length;
+ }
+
+ return NULL;
+}
+
+static int pds_vfio_release_file(struct inode *inode, struct file *filp)
+{
+ struct pds_vfio_lm_file *lm_file = filp->private_data;
+
+ mutex_lock(&lm_file->lock);
+ lm_file->filep->f_pos = 0;
+ lm_file->size = 0;
+ mutex_unlock(&lm_file->lock);
+ mutex_destroy(&lm_file->lock);
+ kfree(lm_file);
+
+ return 0;
+}
+
+static ssize_t pds_vfio_save_read(struct file *filp, char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct pds_vfio_lm_file *lm_file = filp->private_data;
+ ssize_t done = 0;
+
+ if (pos)
+ return -ESPIPE;
+ pos = &filp->f_pos;
+
+ mutex_lock(&lm_file->lock);
+ if (*pos > lm_file->size) {
+ done = -EINVAL;
+ goto out_unlock;
+ }
+
+ len = min_t(size_t, lm_file->size - *pos, len);
+ while (len) {
+ size_t page_offset;
+ struct page *page;
+ size_t page_len;
+ u8 *from_buff;
+ int err;
+
+ page_offset = (*pos) % PAGE_SIZE;
+ page = pds_vfio_get_file_page(lm_file, *pos - page_offset);
+ if (!page) {
+ if (done == 0)
+ done = -EINVAL;
+ goto out_unlock;
+ }
+
+ page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
+ from_buff = kmap_local_page(page);
+ err = copy_to_user(buf, from_buff + page_offset, page_len);
+ kunmap_local(from_buff);
+ if (err) {
+ done = -EFAULT;
+ goto out_unlock;
+ }
+ *pos += page_len;
+ len -= page_len;
+ done += page_len;
+ buf += page_len;
+ }
+
+out_unlock:
+ mutex_unlock(&lm_file->lock);
+ return done;
+}
+
+static const struct file_operations pds_vfio_save_fops = {
+ .owner = THIS_MODULE,
+ .read = pds_vfio_save_read,
+ .release = pds_vfio_release_file,
+ .llseek = no_llseek,
+};
+
+static int pds_vfio_get_save_file(struct pds_vfio_pci_device *pds_vfio)
+{
+ struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
+ struct pds_vfio_lm_file *lm_file;
+ u64 size;
+ int err;
+
+ /* Get live migration state size in this state */
+ err = pds_vfio_get_lm_state_size_cmd(pds_vfio, &size);
+ if (err) {
+ dev_err(dev, "failed to get save status: %pe\n", ERR_PTR(err));
+ return err;
+ }
+
+ dev_dbg(dev, "save status, size = %lld\n", size);
+
+ if (!size) {
+ dev_err(dev, "invalid state size\n");
+ return -EIO;
+ }
+
+ lm_file = pds_vfio_get_lm_file(&pds_vfio_save_fops, O_RDONLY, size);
+ if (!lm_file) {
+ dev_err(dev, "failed to create save file\n");
+ return -ENOENT;
+ }
+
+ dev_dbg(dev, "size = %lld, alloc_size = %lld, npages = %lld\n",
+ lm_file->size, lm_file->alloc_size, lm_file->npages);
+
+ pds_vfio->save_file = lm_file;
+
+ return 0;
+}
+
+static ssize_t pds_vfio_restore_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct pds_vfio_lm_file *lm_file = filp->private_data;
+ loff_t requested_length;
+ ssize_t done = 0;
+
+ if (pos)
+ return -ESPIPE;
+
+ pos = &filp->f_pos;
+
+ if (*pos < 0 ||
+ check_add_overflow((loff_t)len, *pos, &requested_length))
+ return -EINVAL;
+
+ mutex_lock(&lm_file->lock);
+
+ while (len) {
+ size_t page_offset;
+ struct page *page;
+ size_t page_len;
+ u8 *to_buff;
+ int err;
+
+ page_offset = (*pos) % PAGE_SIZE;
+ page = pds_vfio_get_file_page(lm_file, *pos - page_offset);
+ if (!page) {
+ if (done == 0)
+ done = -EINVAL;
+ goto out_unlock;
+ }
+
+ page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
+ to_buff = kmap_local_page(page);
+ err = copy_from_user(to_buff + page_offset, buf, page_len);
+ kunmap_local(to_buff);
+ if (err) {
+ done = -EFAULT;
+ goto out_unlock;
+ }
+ *pos += page_len;
+ len -= page_len;
+ done += page_len;
+ buf += page_len;
+ lm_file->size += page_len;
+ }
+out_unlock:
+ mutex_unlock(&lm_file->lock);
+ return done;
+}
+
+static const struct file_operations pds_vfio_restore_fops = {
+ .owner = THIS_MODULE,
+ .write = pds_vfio_restore_write,
+ .release = pds_vfio_release_file,
+ .llseek = no_llseek,
+};
+
+static int pds_vfio_get_restore_file(struct pds_vfio_pci_device *pds_vfio)
+{
+ struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
+ struct pds_vfio_lm_file *lm_file;
+ u64 size;
+
+ size = sizeof(union pds_lm_dev_state);
+ dev_dbg(dev, "restore status, size = %lld\n", size);
+
+ if (!size) {
+ dev_err(dev, "invalid state size");
+ return -EIO;
+ }
+
+ lm_file = pds_vfio_get_lm_file(&pds_vfio_restore_fops, O_WRONLY, size);
+ if (!lm_file) {
+ dev_err(dev, "failed to create restore file");
+ return -ENOENT;
+ }
+ pds_vfio->restore_file = lm_file;
+
+ return 0;
+}
+
+struct file *
+pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio,
+ enum vfio_device_mig_state next)
+{
+ enum vfio_device_mig_state cur = pds_vfio->state;
+ int err;
+
+ if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_STOP_COPY) {
+ err = pds_vfio_get_save_file(pds_vfio);
+ if (err)
+ return ERR_PTR(err);
+
+ err = pds_vfio_get_lm_state_cmd(pds_vfio);
+ if (err) {
+ pds_vfio_put_save_file(pds_vfio);
+ return ERR_PTR(err);
+ }
+
+ return pds_vfio->save_file->filep;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP_COPY && next == VFIO_DEVICE_STATE_STOP) {
+ pds_vfio_put_save_file(pds_vfio);
+ pds_vfio_dirty_disable(pds_vfio, true);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RESUMING) {
+ err = pds_vfio_get_restore_file(pds_vfio);
+ if (err)
+ return ERR_PTR(err);
+
+ return pds_vfio->restore_file->filep;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_RESUMING && next == VFIO_DEVICE_STATE_STOP) {
+ err = pds_vfio_set_lm_state_cmd(pds_vfio);
+ if (err)
+ return ERR_PTR(err);
+
+ pds_vfio_put_restore_file(pds_vfio);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_RUNNING && next == VFIO_DEVICE_STATE_RUNNING_P2P) {
+ pds_vfio_send_host_vf_lm_status_cmd(pds_vfio,
+ PDS_LM_STA_IN_PROGRESS);
+ err = pds_vfio_suspend_device_cmd(pds_vfio,
+ PDS_LM_SUSPEND_RESUME_TYPE_P2P);
+ if (err)
+ return ERR_PTR(err);
+
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_RUNNING) {
+ err = pds_vfio_resume_device_cmd(pds_vfio,
+ PDS_LM_SUSPEND_RESUME_TYPE_FULL);
+ if (err)
+ return ERR_PTR(err);
+
+ pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE);
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RUNNING_P2P) {
+ err = pds_vfio_resume_device_cmd(pds_vfio,
+ PDS_LM_SUSPEND_RESUME_TYPE_P2P);
+ if (err)
+ return ERR_PTR(err);
+
+ return NULL;
+ }
+
+ if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_STOP) {
+ err = pds_vfio_suspend_device_cmd(pds_vfio,
+ PDS_LM_SUSPEND_RESUME_TYPE_FULL);
+ if (err)
+ return ERR_PTR(err);
+ return NULL;
+ }
+
+ return ERR_PTR(-EINVAL);
+}
diff --git a/drivers/vfio/pci/pds/lm.h b/drivers/vfio/pci/pds/lm.h
new file mode 100644
index 000000000000..13be893198b7
--- /dev/null
+++ b/drivers/vfio/pci/pds/lm.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _LM_H_
+#define _LM_H_
+
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/scatterlist.h>
+#include <linux/types.h>
+
+#include <linux/pds/pds_common.h>
+#include <linux/pds/pds_adminq.h>
+
+struct pds_vfio_lm_file {
+ struct file *filep;
+ struct mutex lock; /* protect live migration data file */
+ u64 size; /* Size with valid data */
+ u64 alloc_size; /* Total allocated size. Always >= len */
+ void *page_mem; /* memory allocated for pages */
+ struct page **pages; /* Backing pages for file */
+ unsigned long long npages;
+ struct sg_table sg_table; /* SG table for backing pages */
+ struct pds_lm_sg_elem *sgl; /* DMA mapping */
+ dma_addr_t sgl_addr;
+ u16 num_sge;
+ struct scatterlist *last_offset_sg; /* Iterator */
+ unsigned int sg_last_entry;
+ unsigned long last_offset;
+};
+
+struct pds_vfio_pci_device;
+
+struct file *
+pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio,
+ enum vfio_device_mig_state next);
+
+void pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio);
+
+#endif /* _LM_H_ */
diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c
new file mode 100644
index 000000000000..ab4b5958e413
--- /dev/null
+++ b/drivers/vfio/pci/pds/pci_drv.c
@@ -0,0 +1,209 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/types.h>
+#include <linux/vfio.h>
+
+#include <linux/pds/pds_common.h>
+#include <linux/pds/pds_core_if.h>
+#include <linux/pds/pds_adminq.h>
+
+#include "vfio_dev.h"
+#include "pci_drv.h"
+#include "cmds.h"
+
+#define PDS_VFIO_DRV_DESCRIPTION "AMD/Pensando VFIO Device Driver"
+#define PCI_VENDOR_ID_PENSANDO 0x1dd8
+
+static void pds_vfio_recovery(struct pds_vfio_pci_device *pds_vfio)
+{
+ bool deferred_reset_needed = false;
+
+ /*
+ * Documentation states that the kernel migration driver must not
+ * generate asynchronous device state transitions outside of
+ * manipulation by the user or the VFIO_DEVICE_RESET ioctl.
+ *
+ * Since recovery is an asynchronous event received from the device,
+ * initiate a deferred reset. Issue a deferred reset in the following
+ * situations:
+ * 1. Migration is in progress, which will cause the next step of
+ * the migration to fail.
+ * 2. If the device is in a state that will be set to
+ * VFIO_DEVICE_STATE_RUNNING on the next action (i.e. VM is
+ * shutdown and device is in VFIO_DEVICE_STATE_STOP).
+ */
+ mutex_lock(&pds_vfio->state_mutex);
+ if ((pds_vfio->state != VFIO_DEVICE_STATE_RUNNING &&
+ pds_vfio->state != VFIO_DEVICE_STATE_ERROR) ||
+ (pds_vfio->state == VFIO_DEVICE_STATE_RUNNING &&
+ pds_vfio_dirty_is_enabled(pds_vfio)))
+ deferred_reset_needed = true;
+ mutex_unlock(&pds_vfio->state_mutex);
+
+ /*
+ * On the next user-initiated state transition, the device will
+ * transition to VFIO_DEVICE_STATE_ERROR. At that point it's the
+ * user's responsibility to reset the device.
+ *
+ * If a VFIO_DEVICE_RESET is requested post recovery and before the
+ * next state transition, then the deferred reset state will be set to
+ * VFIO_DEVICE_STATE_RUNNING.
+ */
+ if (deferred_reset_needed) {
+ spin_lock(&pds_vfio->reset_lock);
+ pds_vfio->deferred_reset = true;
+ pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_ERROR;
+ spin_unlock(&pds_vfio->reset_lock);
+ }
+}
+
+static int pds_vfio_pci_notify_handler(struct notifier_block *nb,
+ unsigned long ecode, void *data)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(nb, struct pds_vfio_pci_device, nb);
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ union pds_core_notifyq_comp *event = data;
+
+ dev_dbg(dev, "%s: event code %lu\n", __func__, ecode);
+
+ /*
+ * We don't need to do anything for RESET state==0 as there is no notify
+ * or feedback mechanism available, and it is possible that we won't
+ * even see a state==0 event since the pds_core recovery is pending.
+ *
+ * Any requests from VFIO while state==0 will fail, which will return
+ * error and may cause migration to fail.
+ */
+ if (ecode == PDS_EVENT_RESET) {
+ dev_info(dev, "%s: PDS_EVENT_RESET event received, state==%d\n",
+ __func__, event->reset.state);
+ /*
+ * pds_core device finished recovery and sent us the
+ * notification (state == 1) to allow us to recover
+ */
+ if (event->reset.state == 1)
+ pds_vfio_recovery(pds_vfio);
+ }
+
+ return 0;
+}
+
+static int
+pds_vfio_pci_register_event_handler(struct pds_vfio_pci_device *pds_vfio)
+{
+ struct device *dev = pds_vfio_to_dev(pds_vfio);
+ struct notifier_block *nb = &pds_vfio->nb;
+ int err;
+
+ if (!nb->notifier_call) {
+ nb->notifier_call = pds_vfio_pci_notify_handler;
+ err = pdsc_register_notify(nb);
+ if (err) {
+ nb->notifier_call = NULL;
+ dev_err(dev,
+ "failed to register pds event handler: %pe\n",
+ ERR_PTR(err));
+ return -EINVAL;
+ }
+ dev_dbg(dev, "pds event handler registered\n");
+ }
+
+ return 0;
+}
+
+static void
+pds_vfio_pci_unregister_event_handler(struct pds_vfio_pci_device *pds_vfio)
+{
+ if (pds_vfio->nb.notifier_call) {
+ pdsc_unregister_notify(&pds_vfio->nb);
+ pds_vfio->nb.notifier_call = NULL;
+ }
+}
+
+static int pds_vfio_pci_probe(struct pci_dev *pdev,
+ const struct pci_device_id *id)
+{
+ struct pds_vfio_pci_device *pds_vfio;
+ int err;
+
+ pds_vfio = vfio_alloc_device(pds_vfio_pci_device, vfio_coredev.vdev,
+ &pdev->dev, pds_vfio_ops_info());
+ if (IS_ERR(pds_vfio))
+ return PTR_ERR(pds_vfio);
+
+ dev_set_drvdata(&pdev->dev, &pds_vfio->vfio_coredev);
+
+ err = vfio_pci_core_register_device(&pds_vfio->vfio_coredev);
+ if (err)
+ goto out_put_vdev;
+
+ err = pds_vfio_register_client_cmd(pds_vfio);
+ if (err) {
+ dev_err(&pdev->dev, "failed to register as client: %pe\n",
+ ERR_PTR(err));
+ goto out_unregister_coredev;
+ }
+
+ err = pds_vfio_pci_register_event_handler(pds_vfio);
+ if (err)
+ goto out_unregister_client;
+
+ return 0;
+
+out_unregister_client:
+ pds_vfio_unregister_client_cmd(pds_vfio);
+out_unregister_coredev:
+ vfio_pci_core_unregister_device(&pds_vfio->vfio_coredev);
+out_put_vdev:
+ vfio_put_device(&pds_vfio->vfio_coredev.vdev);
+ return err;
+}
+
+static void pds_vfio_pci_remove(struct pci_dev *pdev)
+{
+ struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev);
+
+ pds_vfio_pci_unregister_event_handler(pds_vfio);
+ pds_vfio_unregister_client_cmd(pds_vfio);
+ vfio_pci_core_unregister_device(&pds_vfio->vfio_coredev);
+ vfio_put_device(&pds_vfio->vfio_coredev.vdev);
+}
+
+static const struct pci_device_id pds_vfio_pci_table[] = {
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_PENSANDO, 0x1003) }, /* Ethernet VF */
+ { 0, }
+};
+MODULE_DEVICE_TABLE(pci, pds_vfio_pci_table);
+
+static void pds_vfio_pci_aer_reset_done(struct pci_dev *pdev)
+{
+ struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev);
+
+ pds_vfio_reset(pds_vfio);
+}
+
+static const struct pci_error_handlers pds_vfio_pci_err_handlers = {
+ .reset_done = pds_vfio_pci_aer_reset_done,
+ .error_detected = vfio_pci_core_aer_err_detected,
+};
+
+static struct pci_driver pds_vfio_pci_driver = {
+ .name = KBUILD_MODNAME,
+ .id_table = pds_vfio_pci_table,
+ .probe = pds_vfio_pci_probe,
+ .remove = pds_vfio_pci_remove,
+ .err_handler = &pds_vfio_pci_err_handlers,
+ .driver_managed_dma = true,
+};
+
+module_pci_driver(pds_vfio_pci_driver);
+
+MODULE_DESCRIPTION(PDS_VFIO_DRV_DESCRIPTION);
+MODULE_AUTHOR("Brett Creeley <brett.creeley@amd.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/vfio/pci/pds/pci_drv.h b/drivers/vfio/pci/pds/pci_drv.h
new file mode 100644
index 000000000000..e79bed12ed14
--- /dev/null
+++ b/drivers/vfio/pci/pds/pci_drv.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _PCI_DRV_H
+#define _PCI_DRV_H
+
+#include <linux/pci.h>
+
+#endif /* _PCI_DRV_H */
diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c
new file mode 100644
index 000000000000..b46174f5eb09
--- /dev/null
+++ b/drivers/vfio/pci/pds/vfio_dev.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#include <linux/vfio.h>
+#include <linux/vfio_pci_core.h>
+
+#include "lm.h"
+#include "dirty.h"
+#include "vfio_dev.h"
+
+struct pci_dev *pds_vfio_to_pci_dev(struct pds_vfio_pci_device *pds_vfio)
+{
+ return pds_vfio->vfio_coredev.pdev;
+}
+
+struct device *pds_vfio_to_dev(struct pds_vfio_pci_device *pds_vfio)
+{
+ return &pds_vfio_to_pci_dev(pds_vfio)->dev;
+}
+
+struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev)
+{
+ struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
+
+ return container_of(core_device, struct pds_vfio_pci_device,
+ vfio_coredev);
+}
+
+void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
+{
+again:
+ spin_lock(&pds_vfio->reset_lock);
+ if (pds_vfio->deferred_reset) {
+ pds_vfio->deferred_reset = false;
+ if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) {
+ pds_vfio_put_restore_file(pds_vfio);
+ pds_vfio_put_save_file(pds_vfio);
+ pds_vfio_dirty_disable(pds_vfio, false);
+ }
+ pds_vfio->state = pds_vfio->deferred_reset_state;
+ pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
+ spin_unlock(&pds_vfio->reset_lock);
+ goto again;
+ }
+ mutex_unlock(&pds_vfio->state_mutex);
+ spin_unlock(&pds_vfio->reset_lock);
+}
+
+void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio)
+{
+ spin_lock(&pds_vfio->reset_lock);
+ pds_vfio->deferred_reset = true;
+ pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
+ if (!mutex_trylock(&pds_vfio->state_mutex)) {
+ spin_unlock(&pds_vfio->reset_lock);
+ return;
+ }
+ spin_unlock(&pds_vfio->reset_lock);
+ pds_vfio_state_mutex_unlock(pds_vfio);
+}
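
pds_vfio_reset() can run from the AER reset_done path while a migration ioctl still holds state_mutex, so it never blocks on that mutex: the request is recorded under reset_lock, and if state_mutex is busy the current holder replays the pending request in pds_vfio_state_mutex_unlock() before finally releasing the lock. The stand-alone pthread sketch below mirrors that handshake outside the kernel; it is an illustration of the locking pattern only, not driver code.

#include <pthread.h>
#include <stdbool.h>

enum { DEV_RUNNING, DEV_SAVING };

static pthread_mutex_t state_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t reset_lock = PTHREAD_MUTEX_INITIALIZER;
static bool deferred_reset;
static int state = DEV_RUNNING;

/* Called by whoever currently owns state_mutex, just before releasing it. */
static void state_mutex_unlock(void)
{
again:
	pthread_mutex_lock(&reset_lock);
	if (deferred_reset) {
		deferred_reset = false;
		state = DEV_RUNNING;		/* apply the pending "reset" */
		pthread_mutex_unlock(&reset_lock);
		goto again;			/* re-check for a new request */
	}
	pthread_mutex_unlock(&state_mutex);
	pthread_mutex_unlock(&reset_lock);
}

/* Called from the "reset" path, which must never sleep on state_mutex. */
static void request_reset(void)
{
	pthread_mutex_lock(&reset_lock);
	deferred_reset = true;
	if (pthread_mutex_trylock(&state_mutex) != 0) {
		/* Someone else owns state_mutex; they will apply the reset. */
		pthread_mutex_unlock(&reset_lock);
		return;
	}
	pthread_mutex_unlock(&reset_lock);
	state_mutex_unlock();	/* applies the reset and drops state_mutex */
}
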
+
+static struct file *
+pds_vfio_set_device_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state new_state)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(vdev, struct pds_vfio_pci_device,
+ vfio_coredev.vdev);
+ struct file *res = NULL;
+
+ mutex_lock(&pds_vfio->state_mutex);
+	/*
+	 * The only way to transition out of VFIO_DEVICE_STATE_ERROR is via
+	 * VFIO_DEVICE_RESET, so prevent the state machine from running since
+	 * vfio_mig_get_next_state() will throw a WARN_ON() when transitioning
+	 * from VFIO_DEVICE_STATE_ERROR to any other state.
+	 */
+ while (pds_vfio->state != VFIO_DEVICE_STATE_ERROR &&
+ new_state != pds_vfio->state) {
+ enum vfio_device_mig_state next_state;
+
+ int err = vfio_mig_get_next_state(vdev, pds_vfio->state,
+ new_state, &next_state);
+ if (err) {
+ res = ERR_PTR(err);
+ break;
+ }
+
+ res = pds_vfio_step_device_state_locked(pds_vfio, next_state);
+ if (IS_ERR(res))
+ break;
+
+ pds_vfio->state = next_state;
+
+ if (WARN_ON(res && new_state != pds_vfio->state)) {
+ res = ERR_PTR(-EINVAL);
+ break;
+ }
+ }
+ pds_vfio_state_mutex_unlock(pds_vfio);
+ /* still waiting on a deferred_reset */
+ if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR)
+ res = ERR_PTR(-EIO);
+
+ return res;
+}
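
The migration_set_state callback above is reached from userspace through the VFIO_DEVICE_FEATURE ioctl with VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE; for transitions into STOP_COPY or RESUMING the driver hands back a file descriptor that carries the device state. A rough userspace sketch follows, with error handling trimmed and the uapi struct layout taken from include/uapi/linux/vfio.h as recalled here, so treat it as an outline rather than reference code.

#include <linux/vfio.h>
#include <sys/ioctl.h>

/*
 * Request a migration state transition; returns the data fd the driver
 * hands back for STOP_COPY/RESUMING transitions, or -1 otherwise/on error.
 */
static int mig_set_state(int device_fd, enum vfio_device_mig_state new_state)
{
	struct vfio_device_feature *feature;
	struct vfio_device_feature_mig_state *mig;
	__u64 buf[(sizeof(*feature) + sizeof(*mig) + 7) / 8] = {};

	feature = (struct vfio_device_feature *)buf;
	mig = (struct vfio_device_feature_mig_state *)feature->data;

	feature->argsz = sizeof(*feature) + sizeof(*mig);
	feature->flags = VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
	mig->device_state = new_state;

	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature) < 0)
		return -1;

	return mig->data_fd;
}

A simple save sequence would roughly be mig_set_state(fd, VFIO_DEVICE_STATE_STOP) followed by mig_set_state(fd, VFIO_DEVICE_STATE_STOP_COPY), then read() of the returned fd until EOF.
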
+
+static int pds_vfio_get_device_state(struct vfio_device *vdev,
+ enum vfio_device_mig_state *current_state)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(vdev, struct pds_vfio_pci_device,
+ vfio_coredev.vdev);
+
+ mutex_lock(&pds_vfio->state_mutex);
+ *current_state = pds_vfio->state;
+ pds_vfio_state_mutex_unlock(pds_vfio);
+ return 0;
+}
+
+static int pds_vfio_get_device_state_size(struct vfio_device *vdev,
+ unsigned long *stop_copy_length)
+{
+ *stop_copy_length = PDS_LM_DEVICE_STATE_LENGTH;
+ return 0;
+}
+
+static const struct vfio_migration_ops pds_vfio_lm_ops = {
+ .migration_set_state = pds_vfio_set_device_state,
+ .migration_get_state = pds_vfio_get_device_state,
+ .migration_get_data_size = pds_vfio_get_device_state_size
+};
+
+static const struct vfio_log_ops pds_vfio_log_ops = {
+ .log_start = pds_vfio_dma_logging_start,
+ .log_stop = pds_vfio_dma_logging_stop,
+ .log_read_and_clear = pds_vfio_dma_logging_report,
+};
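
These log_ops back VFIO's device dirty-page tracking features (VFIO_DEVICE_FEATURE_DMA_LOGGING_START/STOP and _REPORT), which userspace also drives through VFIO_DEVICE_FEATURE. Below is a minimal sketch of starting tracking over one IOVA range; the struct field names are as remembered from the uapi header, so verify them before relying on this.

#include <linux/vfio.h>
#include <stdint.h>
#include <sys/ioctl.h>

/* Sketch: start dirty tracking on a single IOVA range at 4KiB granularity. */
static int dma_logging_start(int device_fd, __u64 iova, __u64 length)
{
	struct vfio_device_feature_dma_logging_range range = {
		.iova = iova,
		.length = length,
	};
	struct vfio_device_feature *feature;
	struct vfio_device_feature_dma_logging_control *ctrl;
	__u64 buf[(sizeof(*feature) + sizeof(*ctrl) + 7) / 8] = {};

	feature = (struct vfio_device_feature *)buf;
	ctrl = (struct vfio_device_feature_dma_logging_control *)feature->data;

	feature->argsz = sizeof(*feature) + sizeof(*ctrl);
	feature->flags = VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
	ctrl->page_size = 4096;
	ctrl->num_ranges = 1;
	ctrl->ranges = (uintptr_t)&range;

	return ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);
}

Stopping uses VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP, and dirty bitmaps are read back with VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT, which is what the log_read_and_clear callback above serves.
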
+
+static int pds_vfio_init_device(struct vfio_device *vdev)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(vdev, struct pds_vfio_pci_device,
+ vfio_coredev.vdev);
+ struct pci_dev *pdev = to_pci_dev(vdev->dev);
+ int err, vf_id, pci_id;
+
+ vf_id = pci_iov_vf_id(pdev);
+ if (vf_id < 0)
+ return vf_id;
+
+ err = vfio_pci_core_init_dev(vdev);
+ if (err)
+ return err;
+
+ pds_vfio->vf_id = vf_id;
+
+ vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;
+ vdev->mig_ops = &pds_vfio_lm_ops;
+ vdev->log_ops = &pds_vfio_log_ops;
+
+ pci_id = PCI_DEVID(pdev->bus->number, pdev->devfn);
+ dev_dbg(&pdev->dev,
+ "%s: PF %#04x VF %#04x vf_id %d domain %d pds_vfio %p\n",
+ __func__, pci_dev_id(pdev->physfn), pci_id, vf_id,
+ pci_domain_nr(pdev->bus), pds_vfio);
+
+ return 0;
+}
+
+static int pds_vfio_open_device(struct vfio_device *vdev)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(vdev, struct pds_vfio_pci_device,
+ vfio_coredev.vdev);
+ int err;
+
+ err = vfio_pci_core_enable(&pds_vfio->vfio_coredev);
+ if (err)
+ return err;
+
+ mutex_init(&pds_vfio->state_mutex);
+ pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
+ pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING;
+
+ vfio_pci_core_finish_enable(&pds_vfio->vfio_coredev);
+
+ return 0;
+}
+
+static void pds_vfio_close_device(struct vfio_device *vdev)
+{
+ struct pds_vfio_pci_device *pds_vfio =
+ container_of(vdev, struct pds_vfio_pci_device,
+ vfio_coredev.vdev);
+
+ mutex_lock(&pds_vfio->state_mutex);
+ pds_vfio_put_restore_file(pds_vfio);
+ pds_vfio_put_save_file(pds_vfio);
+ pds_vfio_dirty_disable(pds_vfio, true);
+ mutex_unlock(&pds_vfio->state_mutex);
+ mutex_destroy(&pds_vfio->state_mutex);
+ vfio_pci_core_close_device(vdev);
+}
+
+static const struct vfio_device_ops pds_vfio_ops = {
+ .name = "pds-vfio",
+ .init = pds_vfio_init_device,
+ .release = vfio_pci_core_release_dev,
+ .open_device = pds_vfio_open_device,
+ .close_device = pds_vfio_close_device,
+ .ioctl = vfio_pci_core_ioctl,
+ .device_feature = vfio_pci_core_ioctl_feature,
+ .read = vfio_pci_core_read,
+ .write = vfio_pci_core_write,
+ .mmap = vfio_pci_core_mmap,
+ .request = vfio_pci_core_request,
+ .match = vfio_pci_core_match,
+ .bind_iommufd = vfio_iommufd_physical_bind,
+ .unbind_iommufd = vfio_iommufd_physical_unbind,
+ .attach_ioas = vfio_iommufd_physical_attach_ioas,
+};
+
+const struct vfio_device_ops *pds_vfio_ops_info(void)
+{
+ return &pds_vfio_ops;
+}
diff --git a/drivers/vfio/pci/pds/vfio_dev.h b/drivers/vfio/pci/pds/vfio_dev.h
new file mode 100644
index 000000000000..b8f2d667608f
--- /dev/null
+++ b/drivers/vfio/pci/pds/vfio_dev.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _VFIO_DEV_H_
+#define _VFIO_DEV_H_
+
+#include <linux/pci.h>
+#include <linux/vfio_pci_core.h>
+
+#include "dirty.h"
+#include "lm.h"
+
+struct pds_vfio_pci_device {
+ struct vfio_pci_core_device vfio_coredev;
+
+ struct pds_vfio_lm_file *save_file;
+ struct pds_vfio_lm_file *restore_file;
+ struct pds_vfio_dirty dirty;
+ struct mutex state_mutex; /* protect migration state */
+ enum vfio_device_mig_state state;
+ spinlock_t reset_lock; /* protect reset_done flow */
+ u8 deferred_reset;
+ enum vfio_device_mig_state deferred_reset_state;
+ struct notifier_block nb;
+
+ int vf_id;
+ u16 client_id;
+};
+
+void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio);
+
+const struct vfio_device_ops *pds_vfio_ops_info(void);
+struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev);
+void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio);
+
+struct pci_dev *pds_vfio_to_pci_dev(struct pds_vfio_pci_device *pds_vfio);
+struct device *pds_vfio_to_dev(struct pds_vfio_pci_device *pds_vfio);
+
+#endif /* _VFIO_DEV_H_ */
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 29091ee2e984..cb5b7f865d58 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -141,6 +141,7 @@ static const struct vfio_device_ops vfio_pci_ops = {
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
};
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 20d7b69ea6ff..1929103ee59a 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -27,6 +27,7 @@
#include <linux/vgaarb.h>
#include <linux/nospec.h>
#include <linux/sched/mm.h>
+#include <linux/iommufd.h>
#if IS_ENABLED(CONFIG_EEH)
#include <asm/eeh.h>
#endif
@@ -180,7 +181,8 @@ no_mmap:
struct vfio_pci_group_info;
static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
- struct vfio_pci_group_info *groups);
+ struct vfio_pci_group_info *groups,
+ struct iommufd_ctx *iommufd_ctx);
/*
* INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
@@ -776,29 +778,65 @@ static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
}
struct vfio_pci_fill_info {
- int max;
- int cur;
- struct vfio_pci_dependent_device *devices;
+ struct vfio_pci_dependent_device __user *devices;
+ struct vfio_pci_dependent_device __user *devices_end;
+ struct vfio_device *vdev;
+ u32 count;
+ u32 flags;
};
static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
{
+ struct vfio_pci_dependent_device info = {
+ .segment = pci_domain_nr(pdev->bus),
+ .bus = pdev->bus->number,
+ .devfn = pdev->devfn,
+ };
struct vfio_pci_fill_info *fill = data;
- struct iommu_group *iommu_group;
- if (fill->cur == fill->max)
- return -EAGAIN; /* Something changed, try again */
+ fill->count++;
+ if (fill->devices >= fill->devices_end)
+ return 0;
+
+ if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) {
+ struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev);
+ struct vfio_device_set *dev_set = fill->vdev->dev_set;
+ struct vfio_device *vdev;
+
+ /*
+ * hot-reset requires all affected devices be represented in
+ * the dev_set.
+ */
+ vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
+ if (!vdev) {
+ info.devid = VFIO_PCI_DEVID_NOT_OWNED;
+ } else {
+ int id = vfio_iommufd_get_dev_id(vdev, iommufd);
+
+ if (id > 0)
+ info.devid = id;
+ else if (id == -ENOENT)
+ info.devid = VFIO_PCI_DEVID_OWNED;
+ else
+ info.devid = VFIO_PCI_DEVID_NOT_OWNED;
+ }
+ /* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */
+ if (info.devid == VFIO_PCI_DEVID_NOT_OWNED)
+ fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
+ } else {
+ struct iommu_group *iommu_group;
+
+ iommu_group = iommu_group_get(&pdev->dev);
+ if (!iommu_group)
+ return -EPERM; /* Cannot reset non-isolated devices */
- iommu_group = iommu_group_get(&pdev->dev);
- if (!iommu_group)
- return -EPERM; /* Cannot reset non-isolated devices */
+ info.group_id = iommu_group_id(iommu_group);
+ iommu_group_put(iommu_group);
+ }
- fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
- fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
- fill->devices[fill->cur].bus = pdev->bus->number;
- fill->devices[fill->cur].devfn = pdev->devfn;
- fill->cur++;
- iommu_group_put(iommu_group);
+ if (copy_to_user(fill->devices, &info, sizeof(info)))
+ return -EFAULT;
+ fill->devices++;
return 0;
}
@@ -920,24 +958,17 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev,
struct vfio_device_info __user *arg)
{
unsigned long minsz = offsetofend(struct vfio_device_info, num_irqs);
- struct vfio_device_info info;
+ struct vfio_device_info info = {};
struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
- unsigned long capsz;
int ret;
- /* For backward compatibility, cannot require this */
- capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
-
if (copy_from_user(&info, arg, minsz))
return -EFAULT;
if (info.argsz < minsz)
return -EINVAL;
- if (info.argsz >= capsz) {
- minsz = capsz;
- info.cap_offset = 0;
- }
+ minsz = min_t(size_t, info.argsz, sizeof(info));
info.flags = VFIO_DEVICE_FLAGS_PCI;
@@ -1228,8 +1259,7 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
unsigned long minsz =
offsetofend(struct vfio_pci_hot_reset_info, count);
struct vfio_pci_hot_reset_info hdr;
- struct vfio_pci_fill_info fill = { 0 };
- struct vfio_pci_dependent_device *devices = NULL;
+ struct vfio_pci_fill_info fill = {};
bool slot = false;
int ret = 0;
@@ -1247,78 +1277,42 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
else if (pci_probe_reset_bus(vdev->pdev->bus))
return -ENODEV;
- /* How many devices are affected? */
- ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
- &fill.max, slot);
- if (ret)
- return ret;
+ fill.devices = arg->devices;
+ fill.devices_end = arg->devices +
+ (hdr.argsz - sizeof(hdr)) / sizeof(arg->devices[0]);
+ fill.vdev = &vdev->vdev;
- WARN_ON(!fill.max); /* Should always be at least one */
-
- /*
- * If there's enough space, fill it now, otherwise return -ENOSPC and
- * the number of devices affected.
- */
- if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
- ret = -ENOSPC;
- hdr.count = fill.max;
- goto reset_info_exit;
- }
-
- devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
- if (!devices)
- return -ENOMEM;
-
- fill.devices = devices;
+ if (vfio_device_cdev_opened(&vdev->vdev))
+ fill.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID |
+ VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
+ mutex_lock(&vdev->vdev.dev_set->lock);
ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs,
&fill, slot);
+ mutex_unlock(&vdev->vdev.dev_set->lock);
+ if (ret)
+ return ret;
- /*
- * If a device was removed between counting and filling, we may come up
- * short of fill.max. If a device was added, we'll have a return of
- * -EAGAIN above.
- */
- if (!ret)
- hdr.count = fill.cur;
-
-reset_info_exit:
+ hdr.count = fill.count;
+ hdr.flags = fill.flags;
if (copy_to_user(arg, &hdr, minsz))
- ret = -EFAULT;
-
- if (!ret) {
- if (copy_to_user(&arg->devices, devices,
- hdr.count * sizeof(*devices)))
- ret = -EFAULT;
- }
+ return -EFAULT;
- kfree(devices);
- return ret;
+ if (fill.count > fill.devices - arg->devices)
+ return -ENOSPC;
+ return 0;
}
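
With this rework the ioctl copies each dependent device straight into the user buffer as it walks the bus, always reports the full count (returning -ENOSPC when the buffer was too small), and for cdev users reports iommufd device ids (or VFIO_PCI_DEVID_OWNED / VFIO_PCI_DEVID_NOT_OWNED) plus the DEV_ID flags instead of iommu group ids. A hedged userspace sketch of the usual two-call pattern, query the count then retry with a large enough array:

#include <errno.h>
#include <linux/vfio.h>
#include <stdlib.h>
#include <sys/ioctl.h>

/* Returns a malloc()ed hot reset info struct, or NULL on error. */
static struct vfio_pci_hot_reset_info *get_hot_reset_info(int device_fd)
{
	struct vfio_pci_hot_reset_info hdr = { .argsz = sizeof(hdr) };
	struct vfio_pci_hot_reset_info *info;
	size_t sz;

	/* With no room for devices[], the kernel still fills in hdr.count. */
	if (ioctl(device_fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, &hdr) &&
	    errno != ENOSPC)
		return NULL;

	sz = sizeof(*info) +
	     hdr.count * sizeof(struct vfio_pci_dependent_device);
	info = calloc(1, sz);
	if (!info)
		return NULL;

	info->argsz = sz;
	if (ioctl(device_fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info)) {
		free(info);
		return NULL;
	}

	/*
	 * If VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set in info->flags, each
	 * devices[i] entry carries an iommufd device id (or the OWNED /
	 * NOT_OWNED sentinels); otherwise it carries the iommu group id
	 * as before.
	 */
	return info;
}
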
-static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
- struct vfio_pci_hot_reset __user *arg)
+static int
+vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev,
+ int array_count, bool slot,
+ struct vfio_pci_hot_reset __user *arg)
{
- unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count);
- struct vfio_pci_hot_reset hdr;
int32_t *group_fds;
struct file **files;
struct vfio_pci_group_info info;
- bool slot = false;
int file_idx, count = 0, ret = 0;
- if (copy_from_user(&hdr, arg, minsz))
- return -EFAULT;
-
- if (hdr.argsz < minsz || hdr.flags)
- return -EINVAL;
-
- /* Can we do a slot or bus reset or neither? */
- if (!pci_probe_reset_slot(vdev->pdev->slot))
- slot = true;
- else if (pci_probe_reset_bus(vdev->pdev->bus))
- return -ENODEV;
-
/*
* We can't let userspace give us an arbitrarily large buffer to copy,
* so verify how many we think there could be. Note groups can have
@@ -1329,12 +1323,11 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
if (ret)
return ret;
- /* Somewhere between 1 and count is OK */
- if (!hdr.count || hdr.count > count)
+ if (array_count > count)
return -EINVAL;
- group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
- files = kcalloc(hdr.count, sizeof(*files), GFP_KERNEL);
+ group_fds = kcalloc(array_count, sizeof(*group_fds), GFP_KERNEL);
+ files = kcalloc(array_count, sizeof(*files), GFP_KERNEL);
if (!group_fds || !files) {
kfree(group_fds);
kfree(files);
@@ -1342,18 +1335,17 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
}
if (copy_from_user(group_fds, arg->group_fds,
- hdr.count * sizeof(*group_fds))) {
+ array_count * sizeof(*group_fds))) {
kfree(group_fds);
kfree(files);
return -EFAULT;
}
/*
- * For each group_fd, get the group through the vfio external user
- * interface and store the group and iommu ID. This ensures the group
- * is held across the reset.
+ * Get the group file for each fd to ensure the group is held across
+ * the reset
*/
- for (file_idx = 0; file_idx < hdr.count; file_idx++) {
+ for (file_idx = 0; file_idx < array_count; file_idx++) {
struct file *file = fget(group_fds[file_idx]);
if (!file) {
@@ -1377,10 +1369,10 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
if (ret)
goto hot_reset_release;
- info.count = hdr.count;
+ info.count = array_count;
info.files = files;
- ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info);
+ ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info, NULL);
hot_reset_release:
for (file_idx--; file_idx >= 0; file_idx--)
@@ -1390,6 +1382,36 @@ hot_reset_release:
return ret;
}
+static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
+ struct vfio_pci_hot_reset __user *arg)
+{
+ unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count);
+ struct vfio_pci_hot_reset hdr;
+ bool slot = false;
+
+ if (copy_from_user(&hdr, arg, minsz))
+ return -EFAULT;
+
+ if (hdr.argsz < minsz || hdr.flags)
+ return -EINVAL;
+
+ /* zero-length array is only for cdev opened devices */
+ if (!!hdr.count == vfio_device_cdev_opened(&vdev->vdev))
+ return -EINVAL;
+
+ /* Can we do a slot or bus reset or neither? */
+ if (!pci_probe_reset_slot(vdev->pdev->slot))
+ slot = true;
+ else if (pci_probe_reset_bus(vdev->pdev->bus))
+ return -ENODEV;
+
+ if (hdr.count)
+ return vfio_pci_ioctl_pci_hot_reset_groups(vdev, hdr.count, slot, arg);
+
+ return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL,
+ vfio_iommufd_device_ictx(&vdev->vdev));
+}
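
For a device opened through the vfio cdev, hot reset no longer takes group fds: userspace passes a zero-length fd array and ownership is proven through the iommufd the device is bound to, as the dev_set walk below enforces. A small sketch of that call, assuming device_fd came from the cdev path:

#include <linux/vfio.h>
#include <sys/ioctl.h>

/*
 * Issue a hot reset from a cdev-opened device: count == 0 tells the kernel
 * to validate ownership against the bound iommufd rather than a list of
 * group fds.
 */
static int hot_reset_cdev(int device_fd)
{
	struct vfio_pci_hot_reset reset = {
		.argsz = sizeof(reset),
		.count = 0,
	};

	return ioctl(device_fd, VFIO_DEVICE_PCI_HOT_RESET, &reset);
}

Group-opened devices keep the existing contract: a non-zero count and an array of group fds, handled by vfio_pci_ioctl_pci_hot_reset_groups() above.
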
+
static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
struct vfio_device_ioeventfd __user *arg)
{
@@ -2355,13 +2377,16 @@ const struct pci_error_handlers vfio_pci_core_err_handlers = {
};
EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers);
-static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev,
+static bool vfio_dev_in_groups(struct vfio_device *vdev,
struct vfio_pci_group_info *groups)
{
unsigned int i;
+ if (!groups)
+ return false;
+
for (i = 0; i < groups->count; i++)
- if (vfio_file_has_dev(groups->files[i], &vdev->vdev))
+ if (vfio_file_has_dev(groups->files[i], vdev))
return true;
return false;
}
@@ -2369,12 +2394,8 @@ static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev,
static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data)
{
struct vfio_device_set *dev_set = data;
- struct vfio_device *cur;
- list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
- if (cur->dev == &pdev->dev)
- return 0;
- return -EBUSY;
+ return vfio_find_device_in_devset(dev_set, &pdev->dev) ? 0 : -ENODEV;
}
/*
@@ -2441,7 +2462,8 @@ unwind:
* get each memory_lock.
*/
static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
- struct vfio_pci_group_info *groups)
+ struct vfio_pci_group_info *groups,
+ struct iommufd_ctx *iommufd_ctx)
{
struct vfio_pci_core_device *cur_mem;
struct vfio_pci_core_device *cur_vma;
@@ -2471,11 +2493,38 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
goto err_unlock;
list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) {
+ bool owned;
+
/*
- * Test whether all the affected devices are contained by the
- * set of groups provided by the user.
+ * Test whether all the affected devices can be reset by the
+ * user.
+ *
+ * If called from a group opened device and the user provides
+ * a set of groups, all the devices in the dev_set should be
+ * contained by the set of groups provided by the user.
+ *
+ * If called from a cdev opened device and the user provides
+ * a zero-length array, all the devices in the dev_set must
+ * be bound to the same iommufd_ctx as the input iommufd_ctx.
+ * If there is any device that has not been bound to any
+ * iommufd_ctx yet, check if its iommu_group has any device
+ * bound to the input iommufd_ctx. Such devices can be
+ * considered owned by the input iommufd_ctx as the device
+ * cannot be owned by another iommufd_ctx when its iommu_group
+ * is owned.
+ *
+ * Otherwise, reset is not allowed.
*/
- if (!vfio_dev_in_groups(cur_vma, groups)) {
+ if (iommufd_ctx) {
+ int devid = vfio_iommufd_get_dev_id(&cur_vma->vdev,
+ iommufd_ctx);
+
+ owned = (devid > 0 || devid == -ENOENT);
+ } else {
+ owned = vfio_dev_in_groups(&cur_vma->vdev, groups);
+ }
+
+ if (!owned) {
ret = -EINVAL;
goto err_undo;
}