summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-12-15 13:12:15 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2022-12-15 13:12:15 -0800
commit785d21ba2f447fb26df4b22f45653763beb767ea (patch)
tree6c447d85c3359198921807a1d121499d47c80233 /include
parent8fa590bf344816c925810331eea8387627bbeb40 (diff)
parent70be6f322860d322ebcd120cf0c05402ead5c6de (diff)
Merge tag 'vfio-v6.2-rc1' of https://github.com/awilliam/linux-vfio
Pull VFIO updates from Alex Williamson: - Replace deprecated git://github.com link in MAINTAINERS (Palmer Dabbelt) - Simplify vfio/mlx5 with module_pci_driver() helper (Shang XiaoJing) - Drop unnecessary buffer from ACPI call (Rafael Mendonca) - Correct latent missing include issue in iova-bitmap and fix support for unaligned bitmaps. Follow-up with better fix through refactor (Joao Martins) - Rework ccw mdev driver to split private data from parent structure, better aligning with the mdev lifecycle and allowing us to remove a temporary workaround (Eric Farman) - Add an interface to get an estimated migration data size for a device, allowing userspace to make informed decisions, ex. more accurately predicting VM downtime (Yishai Hadas) - Fix minor typo in vfio/mlx5 array declaration (Yishai Hadas) - Simplify module and Kconfig through consolidating SPAPR/EEH code and config options and folding virqfd module into main vfio module (Jason Gunthorpe) - Fix error path from device_register() across all vfio mdev and sample drivers (Alex Williamson) - Define migration pre-copy interface and implement for vfio/mlx5 devices, allowing portions of the device state to be saved while the device continues operation, towards reducing the stop-copy state size (Jason Gunthorpe, Yishai Hadas, Shay Drory) - Implement pre-copy for hisi_acc devices (Shameer Kolothum) - Fixes to mdpy mdev driver remove path and error path on probe (Shang XiaoJing) - vfio/mlx5 fixes for incorrect return after copy_to_user() fault and incorrect buffer freeing (Dan Carpenter) * tag 'vfio-v6.2-rc1' of https://github.com/awilliam/linux-vfio: (42 commits) vfio/mlx5: error pointer dereference in error handling vfio/mlx5: fix error code in mlx5vf_precopy_ioctl() samples: vfio-mdev: Fix missing pci_disable_device() in mdpy_fb_probe() hisi_acc_vfio_pci: Enable PRE_COPY flag hisi_acc_vfio_pci: Move the dev compatibility tests for early check hisi_acc_vfio_pci: Introduce support for PRE_COPY state transitions hisi_acc_vfio_pci: Add support for precopy IOCTL vfio/mlx5: Enable MIGRATION_PRE_COPY flag vfio/mlx5: Fallback to STOP_COPY upon specific PRE_COPY error vfio/mlx5: Introduce multiple loads vfio/mlx5: Consider temporary end of stream as part of PRE_COPY vfio/mlx5: Introduce vfio precopy ioctl implementation vfio/mlx5: Introduce SW headers for migration states vfio/mlx5: Introduce device transitions of PRE_COPY vfio/mlx5: Refactor to use queue based data chunks vfio/mlx5: Refactor migration file state vfio/mlx5: Refactor MKEY usage vfio/mlx5: Refactor PD usage vfio/mlx5: Enforce a single SAVE command at a time vfio: Extend the device migration protocol with PRE_COPY ...
Diffstat (limited to 'include')
-rw-r--r--include/linux/mlx5/mlx5_ifc.h14
-rw-r--r--include/linux/vfio.h31
-rw-r--r--include/uapi/linux/vfio.h136
3 files changed, 148 insertions, 33 deletions
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 152d2d7f8743..f3d1c62c98dd 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1891,7 +1891,12 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
u8 max_reformat_remove_size[0x8];
u8 max_reformat_remove_offset[0x8];
- u8 reserved_at_c0[0xe0];
+ u8 reserved_at_c0[0x8];
+ u8 migration_multi_load[0x1];
+ u8 migration_tracking_state[0x1];
+ u8 reserved_at_ca[0x16];
+
+ u8 reserved_at_e0[0xc0];
u8 reserved_at_1a0[0xb];
u8 log_min_mkey_entity_size[0x5];
@@ -12033,7 +12038,8 @@ struct mlx5_ifc_query_vhca_migration_state_in_bits {
u8 reserved_at_20[0x10];
u8 op_mod[0x10];
- u8 reserved_at_40[0x10];
+ u8 incremental[0x1];
+ u8 reserved_at_41[0xf];
u8 vhca_id[0x10];
u8 reserved_at_60[0x20];
@@ -12059,7 +12065,9 @@ struct mlx5_ifc_save_vhca_state_in_bits {
u8 reserved_at_20[0x10];
u8 op_mod[0x10];
- u8 reserved_at_40[0x10];
+ u8 incremental[0x1];
+ u8 set_track[0x1];
+ u8 reserved_at_42[0xe];
u8 vhca_id[0x10];
u8 reserved_at_60[0x20];
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index a615542df1e0..35be78e9ae57 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -146,6 +146,9 @@ int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id);
* @migration_get_state: Optional callback to get the migration state for
* devices that support migration. It's mandatory for
* VFIO_DEVICE_FEATURE_MIGRATION migration support.
+ * @migration_get_data_size: Optional callback to get the estimated data
+ * length that will be required to complete stop copy. It's mandatory for
+ * VFIO_DEVICE_FEATURE_MIGRATION migration support.
*/
struct vfio_migration_ops {
struct file *(*migration_set_state)(
@@ -153,6 +156,8 @@ struct vfio_migration_ops {
enum vfio_device_mig_state new_state);
int (*migration_get_state)(struct vfio_device *device,
enum vfio_device_mig_state *curr_state);
+ int (*migration_get_data_size)(struct vfio_device *device,
+ unsigned long *stop_copy_length);
};
/**
@@ -215,9 +220,6 @@ struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
dev, ops), \
struct dev_struct, member)
-int vfio_init_device(struct vfio_device *device, struct device *dev,
- const struct vfio_device_ops *ops);
-void vfio_free_device(struct vfio_device *device);
static inline void vfio_put_device(struct vfio_device *device)
{
put_device(&device->device);
@@ -271,29 +273,6 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr,
int num_irqs, int max_irq_type,
size_t *data_size);
-struct pci_dev;
-#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH)
-void vfio_spapr_pci_eeh_open(struct pci_dev *pdev);
-void vfio_spapr_pci_eeh_release(struct pci_dev *pdev);
-long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, unsigned int cmd,
- unsigned long arg);
-#else
-static inline void vfio_spapr_pci_eeh_open(struct pci_dev *pdev)
-{
-}
-
-static inline void vfio_spapr_pci_eeh_release(struct pci_dev *pdev)
-{
-}
-
-static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
- unsigned int cmd,
- unsigned long arg)
-{
- return -ENOTTY;
-}
-#endif /* CONFIG_VFIO_SPAPR_EEH */
-
/*
* IRQfd - generic
*/
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index d7d8e0922376..23105eb036fa 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -819,12 +819,20 @@ struct vfio_device_feature {
* VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P means that RUNNING_P2P
* is supported in addition to the STOP_COPY states.
*
+ * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY means that
+ * PRE_COPY is supported in addition to the STOP_COPY states.
+ *
+ * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P | VFIO_MIGRATION_PRE_COPY
+ * means that RUNNING_P2P, PRE_COPY and PRE_COPY_P2P are supported
+ * in addition to the STOP_COPY states.
+ *
* Other combinations of flags have behavior to be defined in the future.
*/
struct vfio_device_feature_migration {
__aligned_u64 flags;
#define VFIO_MIGRATION_STOP_COPY (1 << 0)
#define VFIO_MIGRATION_P2P (1 << 1)
+#define VFIO_MIGRATION_PRE_COPY (1 << 2)
};
#define VFIO_DEVICE_FEATURE_MIGRATION 1
@@ -875,8 +883,13 @@ struct vfio_device_feature_mig_state {
* RESUMING - The device is stopped and is loading a new internal state
* ERROR - The device has failed and must be reset
*
- * And 1 optional state to support VFIO_MIGRATION_P2P:
+ * And optional states to support VFIO_MIGRATION_P2P:
* RUNNING_P2P - RUNNING, except the device cannot do peer to peer DMA
+ * And VFIO_MIGRATION_PRE_COPY:
+ * PRE_COPY - The device is running normally but tracking internal state
+ * changes
+ * And VFIO_MIGRATION_P2P | VFIO_MIGRATION_PRE_COPY:
+ * PRE_COPY_P2P - PRE_COPY, except the device cannot do peer to peer DMA
*
* The FSM takes actions on the arcs between FSM states. The driver implements
* the following behavior for the FSM arcs:
@@ -908,20 +921,48 @@ struct vfio_device_feature_mig_state {
*
* To abort a RESUMING session the device must be reset.
*
+ * PRE_COPY -> RUNNING
* RUNNING_P2P -> RUNNING
* While in RUNNING the device is fully operational, the device may generate
* interrupts, DMA, respond to MMIO, all vfio device regions are functional,
* and the device may advance its internal state.
*
+ * The PRE_COPY arc will terminate a data transfer session.
+ *
+ * PRE_COPY_P2P -> RUNNING_P2P
* RUNNING -> RUNNING_P2P
* STOP -> RUNNING_P2P
* While in RUNNING_P2P the device is partially running in the P2P quiescent
* state defined below.
*
+ * The PRE_COPY_P2P arc will terminate a data transfer session.
+ *
+ * RUNNING -> PRE_COPY
+ * RUNNING_P2P -> PRE_COPY_P2P
* STOP -> STOP_COPY
- * This arc begin the process of saving the device state and will return a
- * new data_fd.
+ * PRE_COPY, PRE_COPY_P2P and STOP_COPY form the "saving group" of states
+ * which share a data transfer session. Moving between these states alters
+ * what is streamed in session, but does not terminate or otherwise affect
+ * the associated fd.
+ *
+ * These arcs begin the process of saving the device state and will return a
+ * new data_fd. The migration driver may perform actions such as enabling
+ * dirty logging of device state when entering PRE_COPY or PER_COPY_P2P.
+ *
+ * Each arc does not change the device operation, the device remains
+ * RUNNING, P2P quiesced or in STOP. The STOP_COPY state is described below
+ * in PRE_COPY_P2P -> STOP_COPY.
*
+ * PRE_COPY -> PRE_COPY_P2P
+ * Entering PRE_COPY_P2P continues all the behaviors of PRE_COPY above.
+ * However, while in the PRE_COPY_P2P state, the device is partially running
+ * in the P2P quiescent state defined below, like RUNNING_P2P.
+ *
+ * PRE_COPY_P2P -> PRE_COPY
+ * This arc allows returning the device to a full RUNNING behavior while
+ * continuing all the behaviors of PRE_COPY.
+ *
+ * PRE_COPY_P2P -> STOP_COPY
* While in the STOP_COPY state the device has the same behavior as STOP
* with the addition that the data transfers session continues to stream the
* migration state. End of stream on the FD indicates the entire device
@@ -939,6 +980,13 @@ struct vfio_device_feature_mig_state {
* device state for this arc if required to prepare the device to receive the
* migration data.
*
+ * STOP_COPY -> PRE_COPY
+ * STOP_COPY -> PRE_COPY_P2P
+ * These arcs are not permitted and return error if requested. Future
+ * revisions of this API may define behaviors for these arcs, in this case
+ * support will be discoverable by a new flag in
+ * VFIO_DEVICE_FEATURE_MIGRATION.
+ *
* any -> ERROR
* ERROR cannot be specified as a device state, however any transition request
* can be failed with an errno return and may then move the device_state into
@@ -950,7 +998,7 @@ struct vfio_device_feature_mig_state {
* The optional peer to peer (P2P) quiescent state is intended to be a quiescent
* state for the device for the purposes of managing multiple devices within a
* user context where peer-to-peer DMA between devices may be active. The
- * RUNNING_P2P states must prevent the device from initiating
+ * RUNNING_P2P and PRE_COPY_P2P states must prevent the device from initiating
* any new P2P DMA transactions. If the device can identify P2P transactions
* then it can stop only P2P DMA, otherwise it must stop all DMA. The migration
* driver must complete any such outstanding operations prior to completing the
@@ -963,6 +1011,8 @@ struct vfio_device_feature_mig_state {
* above FSM arcs. As there are multiple paths through the FSM arcs the path
* should be selected based on the following rules:
* - Select the shortest path.
+ * - The path cannot have saving group states as interior arcs, only
+ * starting/end states.
* Refer to vfio_mig_get_next_state() for the result of the algorithm.
*
* The automatic transit through the FSM arcs that make up the combination
@@ -976,6 +1026,9 @@ struct vfio_device_feature_mig_state {
* support them. The user can discover if these states are supported by using
* VFIO_DEVICE_FEATURE_MIGRATION. By using combination transitions the user can
* avoid knowing about these optional states if the kernel driver supports them.
+ *
+ * Arcs touching PRE_COPY and PRE_COPY_P2P are removed if support for PRE_COPY
+ * is not present.
*/
enum vfio_device_mig_state {
VFIO_DEVICE_STATE_ERROR = 0,
@@ -984,8 +1037,70 @@ enum vfio_device_mig_state {
VFIO_DEVICE_STATE_STOP_COPY = 3,
VFIO_DEVICE_STATE_RESUMING = 4,
VFIO_DEVICE_STATE_RUNNING_P2P = 5,
+ VFIO_DEVICE_STATE_PRE_COPY = 6,
+ VFIO_DEVICE_STATE_PRE_COPY_P2P = 7,
+};
+
+/**
+ * VFIO_MIG_GET_PRECOPY_INFO - _IO(VFIO_TYPE, VFIO_BASE + 21)
+ *
+ * This ioctl is used on the migration data FD in the precopy phase of the
+ * migration data transfer. It returns an estimate of the current data sizes
+ * remaining to be transferred. It allows the user to judge when it is
+ * appropriate to leave PRE_COPY for STOP_COPY.
+ *
+ * This ioctl is valid only in PRE_COPY states and kernel driver should
+ * return -EINVAL from any other migration state.
+ *
+ * The vfio_precopy_info data structure returned by this ioctl provides
+ * estimates of data available from the device during the PRE_COPY states.
+ * This estimate is split into two categories, initial_bytes and
+ * dirty_bytes.
+ *
+ * The initial_bytes field indicates the amount of initial precopy
+ * data available from the device. This field should have a non-zero initial
+ * value and decrease as migration data is read from the device.
+ * It is recommended to leave PRE_COPY for STOP_COPY only after this field
+ * reaches zero. Leaving PRE_COPY earlier might make things slower.
+ *
+ * The dirty_bytes field tracks device state changes relative to data
+ * previously retrieved. This field starts at zero and may increase as
+ * the internal device state is modified or decrease as that modified
+ * state is read from the device.
+ *
+ * Userspace may use the combination of these fields to estimate the
+ * potential data size available during the PRE_COPY phases, as well as
+ * trends relative to the rate the device is dirtying its internal
+ * state, but these fields are not required to have any bearing relative
+ * to the data size available during the STOP_COPY phase.
+ *
+ * Drivers have a lot of flexibility in when and what they transfer during the
+ * PRE_COPY phase, and how they report this from VFIO_MIG_GET_PRECOPY_INFO.
+ *
+ * During pre-copy the migration data FD has a temporary "end of stream" that is
+ * reached when both initial_bytes and dirty_byte are zero. For instance, this
+ * may indicate that the device is idle and not currently dirtying any internal
+ * state. When read() is done on this temporary end of stream the kernel driver
+ * should return ENOMSG from read(). Userspace can wait for more data (which may
+ * never come) by using poll.
+ *
+ * Once in STOP_COPY the migration data FD has a permanent end of stream
+ * signaled in the usual way by read() always returning 0 and poll always
+ * returning readable. ENOMSG may not be returned in STOP_COPY.
+ * Support for this ioctl is mandatory if a driver claims to support
+ * VFIO_MIGRATION_PRE_COPY.
+ *
+ * Return: 0 on success, -1 and errno set on failure.
+ */
+struct vfio_precopy_info {
+ __u32 argsz;
+ __u32 flags;
+ __aligned_u64 initial_bytes;
+ __aligned_u64 dirty_bytes;
};
+#define VFIO_MIG_GET_PRECOPY_INFO _IO(VFIO_TYPE, VFIO_BASE + 21)
+
/*
* Upon VFIO_DEVICE_FEATURE_SET, allow the device to be moved into a low power
* state with the platform-based power management. Device use of lower power
@@ -1128,6 +1243,19 @@ struct vfio_device_feature_dma_logging_report {
#define VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT 8
+/*
+ * Upon VFIO_DEVICE_FEATURE_GET read back the estimated data length that will
+ * be required to complete stop copy.
+ *
+ * Note: Can be called on each device state.
+ */
+
+struct vfio_device_feature_mig_data_size {
+ __aligned_u64 stop_copy_length;
+};
+
+#define VFIO_DEVICE_FEATURE_MIG_DATA_SIZE 9
+
/* -------- API for Type1 VFIO IOMMU -------- */
/**