32 files changed, 1761 insertions(+), 481 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS index a53a1a29b10c..23d628857400 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22212,6 +22212,7 @@ F: include/uapi/linux/virtio_console.h VIRTIO CORE AND NET DRIVERS M: "Michael S. Tsirkin" <mst@redhat.com> M: Jason Wang <jasowang@redhat.com> +R: Xuan Zhuo <xuanzhuo@linux.alibaba.com> L: virtualization@lists.linux-foundation.org S: Maintained F: Documentation/ABI/testing/sysfs-bus-vdpa @@ -22225,6 +22226,7 @@ F: drivers/vdpa/ F: drivers/virtio/ F: include/linux/vdpa.h F: include/linux/virtio*.h +F: include/linux/vringh.h F: include/uapi/linux/virtio_*.h F: tools/virtio/ diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c index 954fc31b4bc7..02922768b129 100644 --- a/drivers/s390/virtio/virtio_ccw.c +++ b/drivers/s390/virtio/virtio_ccw.c @@ -391,7 +391,7 @@ static void virtio_ccw_drop_indicator(struct virtio_ccw_device *vcdev, ccw_device_dma_free(vcdev->cdev, thinint_area, sizeof(*thinint_area)); } -static bool virtio_ccw_kvm_notify(struct virtqueue *vq) +static inline bool virtio_ccw_do_kvm_notify(struct virtqueue *vq, u32 data) { struct virtio_ccw_vq_info *info = vq->priv; struct virtio_ccw_device *vcdev; @@ -402,12 +402,22 @@ static bool virtio_ccw_kvm_notify(struct virtqueue *vq) BUILD_BUG_ON(sizeof(struct subchannel_id) != sizeof(unsigned int)); info->cookie = kvm_hypercall3(KVM_S390_VIRTIO_CCW_NOTIFY, *((unsigned int *)&schid), - vq->index, info->cookie); + data, info->cookie); if (info->cookie < 0) return false; return true; } +static bool virtio_ccw_kvm_notify(struct virtqueue *vq) +{ + return virtio_ccw_do_kvm_notify(vq, vq->index); +} + +static bool virtio_ccw_kvm_notify_with_data(struct virtqueue *vq) +{ + return virtio_ccw_do_kvm_notify(vq, vring_notification_data(vq)); +} + static int virtio_ccw_read_vq_conf(struct virtio_ccw_device *vcdev, struct ccw1 *ccw, int index) { @@ -495,6 +505,7 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev, struct ccw1 *ccw) { struct virtio_ccw_device *vcdev = to_vc_device(vdev); + bool (*notify)(struct virtqueue *vq); int err; struct virtqueue *vq = NULL; struct virtio_ccw_vq_info *info; @@ -502,6 +513,11 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev, unsigned long flags; bool may_reduce; + if (__virtio_test_bit(vdev, VIRTIO_F_NOTIFICATION_DATA)) + notify = virtio_ccw_kvm_notify_with_data; + else + notify = virtio_ccw_kvm_notify; + /* Allocate queue. */ info = kzalloc(sizeof(struct virtio_ccw_vq_info), GFP_KERNEL); if (!info) { @@ -524,7 +540,7 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev, may_reduce = vcdev->revision > 0; vq = vring_create_virtqueue(i, info->num, KVM_VIRTIO_CCW_RING_ALIGN, vdev, true, may_reduce, ctx, - virtio_ccw_kvm_notify, callback, name); + notify, callback, name); if (!vq) { /* For now, we fail if we can't get the requested size. 
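Editor's note on the ccw hunk above: the transport only chooses between two notify callbacks; the 32-bit payload itself comes from vring_notification_data() in the ring core. As a rough, non-authoritative illustration of the split-ring layout (VQ index in the low half, driver's next available index in the high half; packed rings additionally fold the wrap counter into the top bit), here is a standalone userspace sketch. Function and variable names are illustrative, not kernel API.

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: models the split-ring notification-data value that a
 * transport passes to its notify_with_data() callback. */
static uint32_t example_notification_data(uint16_t vq_index, uint16_t next_avail)
{
	return (uint32_t)vq_index | ((uint32_t)next_avail << 16);
}

int main(void)
{
	uint32_t data = example_notification_data(3, 0x1a2b);

	/* A ccw transport would hand this value to the hypercall in place of
	 * the bare vq->index, as in virtio_ccw_kvm_notify_with_data() above. */
	printf("vq=%u next_avail=0x%04x data=0x%08x\n",
	       (unsigned)(data & 0xffffu), (unsigned)(data >> 16), (unsigned)data);
	return 0;
}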
*/ diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 195963b82b63..e29e32b306ad 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -778,12 +778,28 @@ static bool vq_is_tx(u16 idx) return idx % 2; } -static u16 get_features_12_3(u64 features) +enum { + MLX5_VIRTIO_NET_F_MRG_RXBUF = 2, + MLX5_VIRTIO_NET_F_HOST_ECN = 4, + MLX5_VIRTIO_NET_F_GUEST_ECN = 6, + MLX5_VIRTIO_NET_F_GUEST_TSO6 = 7, + MLX5_VIRTIO_NET_F_GUEST_TSO4 = 8, + MLX5_VIRTIO_NET_F_GUEST_CSUM = 9, + MLX5_VIRTIO_NET_F_CSUM = 10, + MLX5_VIRTIO_NET_F_HOST_TSO6 = 11, + MLX5_VIRTIO_NET_F_HOST_TSO4 = 12, +}; + +static u16 get_features(u64 features) { - return (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO4)) << 9) | - (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO6)) << 8) | - (!!(features & BIT_ULL(VIRTIO_NET_F_CSUM)) << 7) | - (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_CSUM)) << 6); + return (!!(features & BIT_ULL(VIRTIO_NET_F_MRG_RXBUF)) << MLX5_VIRTIO_NET_F_MRG_RXBUF) | + (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_ECN)) << MLX5_VIRTIO_NET_F_HOST_ECN) | + (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_ECN)) << MLX5_VIRTIO_NET_F_GUEST_ECN) | + (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_TSO6)) << MLX5_VIRTIO_NET_F_GUEST_TSO6) | + (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_TSO4)) << MLX5_VIRTIO_NET_F_GUEST_TSO4) | + (!!(features & BIT_ULL(VIRTIO_NET_F_CSUM)) << MLX5_VIRTIO_NET_F_CSUM) | + (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO6)) << MLX5_VIRTIO_NET_F_HOST_TSO6) | + (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO4)) << MLX5_VIRTIO_NET_F_HOST_TSO4); } static bool counters_supported(const struct mlx5_vdpa_dev *mvdev) @@ -797,6 +813,7 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in); u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {}; void *obj_context; + u16 mlx_features; void *cmd_hdr; void *vq_ctx; void *in; @@ -812,6 +829,7 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque goto err_alloc; } + mlx_features = get_features(ndev->mvdev.actual_features); cmd_hdr = MLX5_ADDR_OF(create_virtio_net_q_in, in, general_obj_in_cmd_hdr); MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); @@ -822,7 +840,9 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx); MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx); MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3, - get_features_12_3(ndev->mvdev.actual_features)); + mlx_features >> 3); + MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_2_0, + mlx_features & 7); vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context); MLX5_SET(virtio_q, vq_ctx, virtio_q_type, get_queue_type(ndev)); @@ -2171,23 +2191,27 @@ static u32 mlx5_vdpa_get_vq_group(struct vdpa_device *vdev, u16 idx) return MLX5_VDPA_DATAVQ_GROUP; } -enum { MLX5_VIRTIO_NET_F_GUEST_CSUM = 1 << 9, - MLX5_VIRTIO_NET_F_CSUM = 1 << 10, - MLX5_VIRTIO_NET_F_HOST_TSO6 = 1 << 11, - MLX5_VIRTIO_NET_F_HOST_TSO4 = 1 << 12, -}; - static u64 mlx_to_vritio_features(u16 dev_features) { u64 result = 0; - if (dev_features & MLX5_VIRTIO_NET_F_GUEST_CSUM) + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_MRG_RXBUF)) + result |= BIT_ULL(VIRTIO_NET_F_MRG_RXBUF); + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_ECN)) + result |= BIT_ULL(VIRTIO_NET_F_HOST_ECN); + if 
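Editor's note on get_features() above: the firmware object exposes the virtio-net feature mask as two fields, one covering bits 12..3 (queue_feature_bit_mask_12_3) and a new one covering bits 2..0 (queue_feature_bit_mask_2_0), which is why the driver now builds one 13-bit mask and splits it with ">> 3" and "& 7". A minimal standalone sketch of that split; the enum positions mirror the MLX5_VIRTIO_NET_F_* values introduced in the patch.

#include <stdint.h>
#include <stdio.h>

enum {
	EX_MRG_RXBUF = 2,	/* mirrors MLX5_VIRTIO_NET_F_MRG_RXBUF */
	EX_CSUM      = 10,	/* mirrors MLX5_VIRTIO_NET_F_CSUM */
	EX_HOST_TSO4 = 12,	/* mirrors MLX5_VIRTIO_NET_F_HOST_TSO4 */
};

int main(void)
{
	uint16_t mlx_features = (1u << EX_MRG_RXBUF) | (1u << EX_CSUM) |
				(1u << EX_HOST_TSO4);

	/* queue_feature_bit_mask_12_3 takes bits 12..3, _2_0 takes bits 2..0 */
	printf("mask_12_3 = 0x%03x\n", (unsigned)(mlx_features >> 3));
	printf("mask_2_0  = 0x%x\n", (unsigned)(mlx_features & 7));
	return 0;
}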
(dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_ECN)) + result |= BIT_ULL(VIRTIO_NET_F_GUEST_ECN); + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_TSO6)) + result |= BIT_ULL(VIRTIO_NET_F_GUEST_TSO6); + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_TSO4)) + result |= BIT_ULL(VIRTIO_NET_F_GUEST_TSO4); + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_CSUM)) result |= BIT_ULL(VIRTIO_NET_F_GUEST_CSUM); - if (dev_features & MLX5_VIRTIO_NET_F_CSUM) + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_CSUM)) result |= BIT_ULL(VIRTIO_NET_F_CSUM); - if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO6) + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_TSO6)) result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO6); - if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO4) + if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_TSO4)) result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO4); return result; @@ -2298,6 +2322,113 @@ static void update_cvq_info(struct mlx5_vdpa_dev *mvdev) } } +static u8 query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) +{ + u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {}; + int err; + + MLX5_SET(query_vport_state_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_STATE); + MLX5_SET(query_vport_state_in, in, op_mod, opmod); + MLX5_SET(query_vport_state_in, in, vport_number, vport); + if (vport) + MLX5_SET(query_vport_state_in, in, other_vport, 1); + + err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out); + if (err) + return 0; + + return MLX5_GET(query_vport_state_out, out, state); +} + +static bool get_link_state(struct mlx5_vdpa_dev *mvdev) +{ + if (query_vport_state(mvdev->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0) == + VPORT_STATE_UP) + return true; + + return false; +} + +static void update_carrier(struct work_struct *work) +{ + struct mlx5_vdpa_wq_ent *wqent; + struct mlx5_vdpa_dev *mvdev; + struct mlx5_vdpa_net *ndev; + + wqent = container_of(work, struct mlx5_vdpa_wq_ent, work); + mvdev = wqent->mvdev; + ndev = to_mlx5_vdpa_ndev(mvdev); + if (get_link_state(mvdev)) + ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP); + else + ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP); + + if (ndev->config_cb.callback) + ndev->config_cb.callback(ndev->config_cb.private); + + kfree(wqent); +} + +static int queue_link_work(struct mlx5_vdpa_net *ndev) +{ + struct mlx5_vdpa_wq_ent *wqent; + + wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC); + if (!wqent) + return -ENOMEM; + + wqent->mvdev = &ndev->mvdev; + INIT_WORK(&wqent->work, update_carrier); + queue_work(ndev->mvdev.wq, &wqent->work); + return 0; +} + +static int event_handler(struct notifier_block *nb, unsigned long event, void *param) +{ + struct mlx5_vdpa_net *ndev = container_of(nb, struct mlx5_vdpa_net, nb); + struct mlx5_eqe *eqe = param; + int ret = NOTIFY_DONE; + + if (event == MLX5_EVENT_TYPE_PORT_CHANGE) { + switch (eqe->sub_type) { + case MLX5_PORT_CHANGE_SUBTYPE_DOWN: + case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE: + if (queue_link_work(ndev)) + return NOTIFY_DONE; + + ret = NOTIFY_OK; + break; + default: + return NOTIFY_DONE; + } + return ret; + } + return ret; +} + +static void register_link_notifier(struct mlx5_vdpa_net *ndev) +{ + if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_STATUS))) + return; + + ndev->nb.notifier_call = event_handler; + mlx5_notifier_register(ndev->mvdev.mdev, &ndev->nb); + ndev->nb_registered = true; + queue_link_work(ndev); +} + +static void unregister_link_notifier(struct mlx5_vdpa_net *ndev) +{ + if 
(!ndev->nb_registered) + return; + + ndev->nb_registered = false; + mlx5_notifier_unregister(ndev->mvdev.mdev, &ndev->nb); + if (ndev->mvdev.wq) + flush_workqueue(ndev->mvdev.wq); +} + static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 features) { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); @@ -2567,10 +2698,11 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) mlx5_vdpa_warn(mvdev, "failed to setup control VQ vring\n"); goto err_setup; } + register_link_notifier(ndev); err = setup_driver(mvdev); if (err) { mlx5_vdpa_warn(mvdev, "failed to setup driver\n"); - goto err_setup; + goto err_driver; } } else { mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be cleared\n"); @@ -2582,6 +2714,8 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) up_write(&ndev->reslock); return; +err_driver: + unregister_link_notifier(ndev); err_setup: mlx5_vdpa_destroy_mr(&ndev->mvdev); ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED; @@ -2607,6 +2741,7 @@ static int mlx5_vdpa_reset(struct vdpa_device *vdev) mlx5_vdpa_info(mvdev, "performing device reset\n"); down_write(&ndev->reslock); + unregister_link_notifier(ndev); teardown_driver(ndev); clear_vqs_ready(ndev); mlx5_vdpa_destroy_mr(&ndev->mvdev); @@ -2861,9 +2996,7 @@ static int mlx5_vdpa_suspend(struct vdpa_device *vdev) mlx5_vdpa_info(mvdev, "suspending device\n"); down_write(&ndev->reslock); - ndev->nb_registered = false; - mlx5_notifier_unregister(mvdev->mdev, &ndev->nb); - flush_workqueue(ndev->mvdev.wq); + unregister_link_notifier(ndev); for (i = 0; i < ndev->cur_num_vqs; i++) { mvq = &ndev->vqs[i]; suspend_vq(ndev, mvq); @@ -3000,84 +3133,6 @@ struct mlx5_vdpa_mgmtdev { struct mlx5_vdpa_net *ndev; }; -static u8 query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) -{ - u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {}; - u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {}; - int err; - - MLX5_SET(query_vport_state_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_STATE); - MLX5_SET(query_vport_state_in, in, op_mod, opmod); - MLX5_SET(query_vport_state_in, in, vport_number, vport); - if (vport) - MLX5_SET(query_vport_state_in, in, other_vport, 1); - - err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out); - if (err) - return 0; - - return MLX5_GET(query_vport_state_out, out, state); -} - -static bool get_link_state(struct mlx5_vdpa_dev *mvdev) -{ - if (query_vport_state(mvdev->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0) == - VPORT_STATE_UP) - return true; - - return false; -} - -static void update_carrier(struct work_struct *work) -{ - struct mlx5_vdpa_wq_ent *wqent; - struct mlx5_vdpa_dev *mvdev; - struct mlx5_vdpa_net *ndev; - - wqent = container_of(work, struct mlx5_vdpa_wq_ent, work); - mvdev = wqent->mvdev; - ndev = to_mlx5_vdpa_ndev(mvdev); - if (get_link_state(mvdev)) - ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP); - else - ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP); - - if (ndev->nb_registered && ndev->config_cb.callback) - ndev->config_cb.callback(ndev->config_cb.private); - - kfree(wqent); -} - -static int event_handler(struct notifier_block *nb, unsigned long event, void *param) -{ - struct mlx5_vdpa_net *ndev = container_of(nb, struct mlx5_vdpa_net, nb); - struct mlx5_eqe *eqe = param; - int ret = NOTIFY_DONE; - struct mlx5_vdpa_wq_ent *wqent; - - if (event == MLX5_EVENT_TYPE_PORT_CHANGE) { - if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_STATUS))) - return NOTIFY_DONE; - switch (eqe->sub_type) { - case 
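Editor's note on the link-notifier rework above: the MLX5 event notifier is now registered only inside the DRIVER_OK window, and only when VIRTIO_NET_F_STATUS was negotiated; unregister_link_notifier() clears nb_registered, unregisters, and flushes the workqueue so no queued update_carrier() work can touch a device that is being reset, suspended, or torn down. The work item itself just mirrors the vport state into the VIRTIO_NET_S_LINK_UP bit of the little-endian config status word, as in this standalone sketch (names prefixed ex_ are stand-ins, not driver code).

#include <stdint.h>
#include <stdio.h>

#define EX_VIRTIO_NET_S_LINK_UP 1	/* same value as VIRTIO_NET_S_LINK_UP */

/* Stand-in for cpu_to_mlx5vdpa16(): a VERSION_1 device uses little-endian
 * config space, so on a little-endian host this is a no-op. */
static uint16_t ex_cpu_to_vdpa16(uint16_t v) { return v; }

int main(void)
{
	uint16_t status = 0;
	int link_up = 1;

	if (link_up)
		status |= ex_cpu_to_vdpa16(EX_VIRTIO_NET_S_LINK_UP);
	else
		status &= ex_cpu_to_vdpa16((uint16_t)~EX_VIRTIO_NET_S_LINK_UP);

	printf("config.status = 0x%04x\n", (unsigned)status);
	return 0;
}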
MLX5_PORT_CHANGE_SUBTYPE_DOWN: - case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE: - wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC); - if (!wqent) - return NOTIFY_DONE; - - wqent->mvdev = &ndev->mvdev; - INIT_WORK(&wqent->work, update_carrier); - queue_work(ndev->mvdev.wq, &wqent->work); - ret = NOTIFY_OK; - break; - default: - return NOTIFY_DONE; - } - return ret; - } - return ret; -} - static int config_func_mtu(struct mlx5_core_dev *mdev, u16 mtu) { int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); @@ -3127,6 +3182,8 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, return -EINVAL; } device_features &= add_config->device_features; + } else { + device_features &= ~BIT_ULL(VIRTIO_NET_F_MRG_RXBUF); } if (!(device_features & BIT_ULL(VIRTIO_F_VERSION_1) && device_features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM))) { @@ -3258,9 +3315,6 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, goto err_res2; } - ndev->nb.notifier_call = event_handler; - mlx5_notifier_register(mdev, &ndev->nb); - ndev->nb_registered = true; mvdev->vdev.mdev = &mgtdev->mgtdev; err = _vdpa_register_device(&mvdev->vdev, max_vqs + 1); if (err) @@ -3294,10 +3348,7 @@ static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device * mlx5_vdpa_remove_debugfs(ndev->debugfs); ndev->debugfs = NULL; - if (ndev->nb_registered) { - ndev->nb_registered = false; - mlx5_notifier_unregister(mvdev->mdev, &ndev->nb); - } + unregister_link_notifier(ndev); wq = mvdev->wq; mvdev->wq = NULL; destroy_workqueue(wq); diff --git a/drivers/vdpa/solidrun/Makefile b/drivers/vdpa/solidrun/Makefile index c0aa3415bf7b..9116252cd5fa 100644 --- a/drivers/vdpa/solidrun/Makefile +++ b/drivers/vdpa/solidrun/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_SNET_VDPA) += snet_vdpa.o snet_vdpa-$(CONFIG_SNET_VDPA) += snet_main.o +snet_vdpa-$(CONFIG_SNET_VDPA) += snet_ctrl.o ifdef CONFIG_HWMON snet_vdpa-$(CONFIG_SNET_VDPA) += snet_hwmon.o endif diff --git a/drivers/vdpa/solidrun/snet_ctrl.c b/drivers/vdpa/solidrun/snet_ctrl.c new file mode 100644 index 000000000000..3858738643b4 --- /dev/null +++ b/drivers/vdpa/solidrun/snet_ctrl.c @@ -0,0 +1,330 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * SolidRun DPU driver for control plane + * + * Copyright (C) 2022-2023 SolidRun + * + * Author: Alvaro Karsz <alvaro.karsz@solid-run.com> + * + */ + +#include <linux/iopoll.h> + +#include "snet_vdpa.h" + +enum snet_ctrl_opcodes { + SNET_CTRL_OP_DESTROY = 1, + SNET_CTRL_OP_READ_VQ_STATE, + SNET_CTRL_OP_SUSPEND, +}; + +#define SNET_CTRL_TIMEOUT 2000000 + +#define SNET_CTRL_DATA_SIZE_MASK 0x0000FFFF +#define SNET_CTRL_IN_PROCESS_MASK 0x00010000 +#define SNET_CTRL_CHUNK_RDY_MASK 0x00020000 +#define SNET_CTRL_ERROR_MASK 0x0FFC0000 + +#define SNET_VAL_TO_ERR(val) (-(((val) & SNET_CTRL_ERROR_MASK) >> 18)) +#define SNET_EMPTY_CTRL(val) (((val) & SNET_CTRL_ERROR_MASK) || \ + !((val) & SNET_CTRL_IN_PROCESS_MASK)) +#define SNET_DATA_READY(val) ((val) & (SNET_CTRL_ERROR_MASK | SNET_CTRL_CHUNK_RDY_MASK)) + +/* Control register used to read data from the DPU */ +struct snet_ctrl_reg_ctrl { + /* Chunk size in 4B words */ + u16 data_size; + /* We are in the middle of a command */ + u16 in_process:1; + /* A data chunk is ready and can be consumed */ + u16 chunk_ready:1; + /* Error code */ + u16 error:10; + /* Saved for future usage */ + u16 rsvd:4; +}; + +/* Opcode register */ +struct snet_ctrl_reg_op { + u16 opcode; + /* Only if VQ index is relevant for the command */ + u16 vq_idx; +}; + +struct 
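Editor's note on the snet control registers introduced above: struct snet_ctrl_reg_ctrl and the SNET_CTRL_*_MASK macros describe the same 32-bit register; the driver only ever works on raw ioread32()/iowrite32() values using the masks, the struct is documentation of the layout. A standalone decode sketch using the same bit positions:

#include <stdint.h>
#include <stdio.h>

#define EX_DATA_SIZE_MASK  0x0000FFFFu	/* chunk size in 4-byte words */
#define EX_IN_PROCESS_MASK 0x00010000u
#define EX_CHUNK_RDY_MASK  0x00020000u
#define EX_ERROR_MASK      0x0FFC0000u

int main(void)
{
	uint32_t val = 0x00030010;	/* 16 words ready, in process, chunk ready */

	printf("data_size   = %u words\n", (unsigned)(val & EX_DATA_SIZE_MASK));
	printf("in_process  = %u\n", !!(val & EX_IN_PROCESS_MASK));
	printf("chunk_ready = %u\n", !!(val & EX_CHUNK_RDY_MASK));
	printf("error       = %d\n", -(int)((val & EX_ERROR_MASK) >> 18));
	return 0;
}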
snet_ctrl_regs { + struct snet_ctrl_reg_op op; + struct snet_ctrl_reg_ctrl ctrl; + u32 rsvd; + u32 data[]; +}; + +static struct snet_ctrl_regs __iomem *snet_get_ctrl(struct snet *snet) +{ + return snet->bar + snet->psnet->cfg.ctrl_off; +} + +static int snet_wait_for_empty_ctrl(struct snet_ctrl_regs __iomem *regs) +{ + u32 val; + + return readx_poll_timeout(ioread32, ®s->ctrl, val, SNET_EMPTY_CTRL(val), 10, + SNET_CTRL_TIMEOUT); +} + +static int snet_wait_for_empty_op(struct snet_ctrl_regs __iomem *regs) +{ + u32 val; + + return readx_poll_timeout(ioread32, ®s->op, val, !val, 10, SNET_CTRL_TIMEOUT); +} + +static int snet_wait_for_data(struct snet_ctrl_regs __iomem *regs) +{ + u32 val; + + return readx_poll_timeout(ioread32, ®s->ctrl, val, SNET_DATA_READY(val), 10, + SNET_CTRL_TIMEOUT); +} + +static u32 snet_read32_word(struct snet_ctrl_regs __iomem *ctrl_regs, u16 word_idx) +{ + return ioread32(&ctrl_regs->data[word_idx]); +} + +static u32 snet_read_ctrl(struct snet_ctrl_regs __iomem *ctrl_regs) +{ + return ioread32(&ctrl_regs->ctrl); +} + +static void snet_write_ctrl(struct snet_ctrl_regs __iomem *ctrl_regs, u32 val) +{ + iowrite32(val, &ctrl_regs->ctrl); +} + +static void snet_write_op(struct snet_ctrl_regs __iomem *ctrl_regs, u32 val) +{ + iowrite32(val, &ctrl_regs->op); +} + +static int snet_wait_for_dpu_completion(struct snet_ctrl_regs __iomem *ctrl_regs) +{ + /* Wait until the DPU finishes completely. + * It will clear the opcode register. + */ + return snet_wait_for_empty_op(ctrl_regs); +} + +/* Reading ctrl from the DPU: + * buf_size must be 4B aligned + * + * Steps: + * + * (1) Verify that the DPU is not in the middle of another operation by + * reading the in_process and error bits in the control register. + * (2) Write the request opcode and the VQ idx in the opcode register + * and write the buffer size in the control register. + * (3) Start readind chunks of data, chunk_ready bit indicates that a + * data chunk is available, we signal that we read the data by clearing the bit. + * (4) Detect that the transfer is completed when the in_process bit + * in the control register is cleared or when the an error appears. + */ +static int snet_ctrl_read_from_dpu(struct snet *snet, u16 opcode, u16 vq_idx, void *buffer, + u32 buf_size) +{ + struct pci_dev *pdev = snet->pdev; + struct snet_ctrl_regs __iomem *regs = snet_get_ctrl(snet); + u32 *bfr_ptr = (u32 *)buffer; + u32 val; + u16 buf_words; + int ret; + u16 words, i, tot_words = 0; + + /* Supported for config 2+ */ + if (!SNET_CFG_VER(snet, 2)) + return -EOPNOTSUPP; + + if (!IS_ALIGNED(buf_size, 4)) + return -EINVAL; + + mutex_lock(&snet->ctrl_lock); + + buf_words = buf_size / 4; + + /* Make sure control register is empty */ + ret = snet_wait_for_empty_ctrl(regs); + if (ret) { + SNET_WARN(pdev, "Timeout waiting for previous control data to be consumed\n"); + goto exit; + } + + /* We need to write the buffer size in the control register, and the opcode + vq index in + * the opcode register. + * We use a spinlock to serialize the writes. + */ + spin_lock(&snet->ctrl_spinlock); + + snet_write_ctrl(regs, buf_words); + snet_write_op(regs, opcode | (vq_idx << 16)); + + spin_unlock(&snet->ctrl_spinlock); + + while (buf_words != tot_words) { + ret = snet_wait_for_data(regs); + if (ret) { + SNET_WARN(pdev, "Timeout waiting for control data\n"); + goto exit; + } + + val = snet_read_ctrl(regs); + + /* Error? 
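Editor's note on the read path above: the host writes the buffer size in 4-byte words to the control register and packs "opcode | (vq_idx << 16)" into the opcode register, then pulls data in chunks; each chunk's size arrives in the low 16 bits of the control register, and the loop stops when the requested word count is reached or in_process clears. A standalone model of just the chunk accounting (the register I/O is replaced by an in-memory array of chunk sizes):

#include <stdint.h>
#include <stdio.h>

static uint16_t min_u16(uint16_t a, uint16_t b) { return a < b ? a : b; }

int main(void)
{
	uint16_t buf_words = 10, tot_words = 0;
	uint16_t chunks[] = { 4, 4, 8 };	/* DPU may offer more than what is left */

	for (unsigned i = 0; i < 3 && tot_words < buf_words; i++) {
		uint16_t words = min_u16(chunks[i], buf_words - tot_words);

		/* the driver copies 'words' 32-bit words out of regs->data[] here */
		tot_words += words;
		printf("chunk %u: copied %u, total %u/%u\n",
		       i, (unsigned)words, (unsigned)tot_words, (unsigned)buf_words);
	}
	return 0;
}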
*/ + if (val & SNET_CTRL_ERROR_MASK) { + ret = SNET_VAL_TO_ERR(val); + SNET_WARN(pdev, "Error while reading control data from DPU, err %d\n", ret); + goto exit; + } + + words = min_t(u16, val & SNET_CTRL_DATA_SIZE_MASK, buf_words - tot_words); + + for (i = 0; i < words; i++) { + *bfr_ptr = snet_read32_word(regs, i); + bfr_ptr++; + } + + tot_words += words; + + /* Is the job completed? */ + if (!(val & SNET_CTRL_IN_PROCESS_MASK)) + break; + + /* Clear the chunk ready bit and continue */ + val &= ~SNET_CTRL_CHUNK_RDY_MASK; + snet_write_ctrl(regs, val); + } + + ret = snet_wait_for_dpu_completion(regs); + if (ret) + SNET_WARN(pdev, "Timeout waiting for the DPU to complete a control command\n"); + +exit: + mutex_unlock(&snet->ctrl_lock); + return ret; +} + +/* Send a control message to the DPU using the old mechanism + * used with config version 1. + */ +static int snet_send_ctrl_msg_old(struct snet *snet, u32 opcode) +{ + struct pci_dev *pdev = snet->pdev; + struct snet_ctrl_regs __iomem *regs = snet_get_ctrl(snet); + int ret; + + mutex_lock(&snet->ctrl_lock); + + /* Old mechanism uses just 1 register, the opcode register. + * Make sure that the opcode register is empty, and that the DPU isn't + * processing an old message. + */ + ret = snet_wait_for_empty_op(regs); + if (ret) { + SNET_WARN(pdev, "Timeout waiting for previous control message to be ACKed\n"); + goto exit; + } + + /* Write the message */ + snet_write_op(regs, opcode); + + /* DPU ACKs the message by clearing the opcode register */ + ret = snet_wait_for_empty_op(regs); + if (ret) + SNET_WARN(pdev, "Timeout waiting for a control message to be ACKed\n"); + +exit: + mutex_unlock(&snet->ctrl_lock); + return ret; +} + +/* Send a control message to the DPU. + * A control message is a message without payload. + */ +static int snet_send_ctrl_msg(struct snet *snet, u16 opcode, u16 vq_idx) +{ + struct pci_dev *pdev = snet->pdev; + struct snet_ctrl_regs __iomem *regs = snet_get_ctrl(snet); + u32 val; + int ret; + + /* If config version is not 2+, use the old mechanism */ + if (!SNET_CFG_VER(snet, 2)) + return snet_send_ctrl_msg_old(snet, opcode); + + mutex_lock(&snet->ctrl_lock); + + /* Make sure control register is empty */ + ret = snet_wait_for_empty_ctrl(regs); + if (ret) { + SNET_WARN(pdev, "Timeout waiting for previous control data to be consumed\n"); + goto exit; + } + + /* We need to clear the control register and write the opcode + vq index in the opcode + * register. + * We use a spinlock to serialize the writes. + */ + spin_lock(&snet->ctrl_spinlock); + + snet_write_ctrl(regs, 0); + snet_write_op(regs, opcode | (vq_idx << 16)); + + spin_unlock(&snet->ctrl_spinlock); + + /* The DPU ACKs control messages by setting the chunk ready bit + * without data. 
+ */ + ret = snet_wait_for_data(regs); + if (ret) { + SNET_WARN(pdev, "Timeout waiting for control message to be ACKed\n"); + goto exit; + } + + /* Check for errors */ + val = snet_read_ctrl(regs); + ret = SNET_VAL_TO_ERR(val); + + /* Clear the chunk ready bit */ + val &= ~SNET_CTRL_CHUNK_RDY_MASK; + snet_write_ctrl(regs, val); + + ret = snet_wait_for_dpu_completion(regs); + if (ret) + SNET_WARN(pdev, "Timeout waiting for DPU to complete a control command, err %d\n", + ret); + +exit: + mutex_unlock(&snet->ctrl_lock); + return ret; +} + +void snet_ctrl_clear(struct snet *snet) +{ + struct snet_ctrl_regs __iomem *regs = snet_get_ctrl(snet); + + snet_write_op(regs, 0); +} + +int snet_destroy_dev(struct snet *snet) +{ + return snet_send_ctrl_msg(snet, SNET_CTRL_OP_DESTROY, 0); +} + +int snet_read_vq_state(struct snet *snet, u16 idx, struct vdpa_vq_state *state) +{ + return snet_ctrl_read_from_dpu(snet, SNET_CTRL_OP_READ_VQ_STATE, idx, state, + sizeof(*state)); +} + +int snet_suspend_dev(struct snet *snet) +{ + return snet_send_ctrl_msg(snet, SNET_CTRL_OP_SUSPEND, 0); +} diff --git a/drivers/vdpa/solidrun/snet_hwmon.c b/drivers/vdpa/solidrun/snet_hwmon.c index e695e36ff753..42c87387a0f1 100644 --- a/drivers/vdpa/solidrun/snet_hwmon.c +++ b/drivers/vdpa/solidrun/snet_hwmon.c @@ -2,7 +2,7 @@ /* * SolidRun DPU driver for control plane * - * Copyright (C) 2022 SolidRun + * Copyright (C) 2022-2023 SolidRun * * Author: Alvaro Karsz <alvaro.karsz@solid-run.com> * diff --git a/drivers/vdpa/solidrun/snet_main.c b/drivers/vdpa/solidrun/snet_main.c index 68de727398ed..cdcd84ce4f5a 100644 --- a/drivers/vdpa/solidrun/snet_main.c +++ b/drivers/vdpa/solidrun/snet_main.c @@ -2,7 +2,7 @@ /* * SolidRun DPU driver for control plane * - * Copyright (C) 2022 SolidRun + * Copyright (C) 2022-2023 SolidRun * * Author: Alvaro Karsz <alvaro.karsz@solid-run.com> * @@ -16,14 +16,12 @@ /* SNET signature */ #define SNET_SIGNATURE 0xD0D06363 /* Max. config version that we can work with */ -#define SNET_CFG_VERSION 0x1 +#define SNET_CFG_VERSION 0x2 /* Queue align */ #define SNET_QUEUE_ALIGNMENT PAGE_SIZE /* Kick value to notify that new data is available */ #define SNET_KICK_VAL 0x1 #define SNET_CONFIG_OFF 0x0 -/* ACK timeout for a message */ -#define SNET_ACK_TIMEOUT 2000000 /* How long we are willing to wait for a SNET device */ #define SNET_DETECT_TIMEOUT 5000000 /* How long should we wait for the DPU to read our config */ @@ -32,61 +30,16 @@ #define SNET_GENERAL_CFG_LEN 36 #define SNET_GENERAL_CFG_VQ_LEN 40 -enum snet_msg { - SNET_MSG_DESTROY = 1, -}; - static struct snet *vdpa_to_snet(struct vdpa_device *vdpa) { return container_of(vdpa, struct snet, vdpa); } -static int snet_wait_for_msg_ack(struct snet *snet) -{ - struct pci_dev *pdev = snet->pdev; - int ret; - u32 val; - - /* The DPU will clear the messages offset once messages - * are processed. - */ - ret = readx_poll_timeout(ioread32, snet->bar + snet->psnet->cfg.msg_off, - val, !val, 10, SNET_ACK_TIMEOUT); - if (ret) - SNET_WARN(pdev, "Timeout waiting for message ACK\n"); - - return ret; -} - -/* Sends a message to the DPU. - * If blocking is set, the function will return once the - * message was processed by the DPU (or timeout). 
- */ -static int snet_send_msg(struct snet *snet, u32 msg, bool blocking) -{ - int ret = 0; - - /* Make sure the DPU acked last message before issuing a new one */ - ret = snet_wait_for_msg_ack(snet); - if (ret) - return ret; - - /* Write the message */ - snet_write32(snet, snet->psnet->cfg.msg_off, msg); - - if (blocking) - ret = snet_wait_for_msg_ack(snet); - else /* If non-blocking, flush the write by issuing a read */ - snet_read32(snet, snet->psnet->cfg.msg_off); - - return ret; -} - static irqreturn_t snet_cfg_irq_hndlr(int irq, void *data) { struct snet *snet = data; /* Call callback if any */ - if (snet->cb.callback) + if (likely(snet->cb.callback)) return snet->cb.callback(snet->cb.private); return IRQ_HANDLED; @@ -96,7 +49,7 @@ static irqreturn_t snet_vq_irq_hndlr(int irq, void *data) { struct snet_vq *vq = data; /* Call callback if any */ - if (vq->cb.callback) + if (likely(vq->cb.callback)) return vq->cb.callback(vq->cb.private); return IRQ_HANDLED; @@ -153,12 +106,24 @@ static void snet_kick_vq(struct vdpa_device *vdev, u16 idx) { struct snet *snet = vdpa_to_snet(vdev); /* not ready - ignore */ - if (!snet->vqs[idx]->ready) + if (unlikely(!snet->vqs[idx]->ready)) return; iowrite32(SNET_KICK_VAL, snet->vqs[idx]->kick_ptr); } +static void snet_kick_vq_with_data(struct vdpa_device *vdev, u32 data) +{ + struct snet *snet = vdpa_to_snet(vdev); + u16 idx = data & 0xFFFF; + + /* not ready - ignore */ + if (unlikely(!snet->vqs[idx]->ready)) + return; + + iowrite32((data & 0xFFFF0000) | SNET_KICK_VAL, snet->vqs[idx]->kick_ptr); +} + static void snet_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb) { struct snet *snet = vdpa_to_snet(vdev); @@ -181,33 +146,48 @@ static bool snet_get_vq_ready(struct vdpa_device *vdev, u16 idx) return snet->vqs[idx]->ready; } -static int snet_set_vq_state(struct vdpa_device *vdev, u16 idx, const struct vdpa_vq_state *state) +static bool snet_vq_state_is_initial(struct snet *snet, const struct vdpa_vq_state *state) { - struct snet *snet = vdpa_to_snet(vdev); - /* Setting the VQ state is not supported. - * If the asked state is the same as the initial one - * we can ignore it. - */ if (SNET_HAS_FEATURE(snet, VIRTIO_F_RING_PACKED)) { const struct vdpa_vq_state_packed *p = &state->packed; if (p->last_avail_counter == 1 && p->last_used_counter == 1 && p->last_avail_idx == 0 && p->last_used_idx == 0) - return 0; + return true; } else { const struct vdpa_vq_state_split *s = &state->split; if (s->avail_index == 0) - return 0; + return true; } + return false; +} + +static int snet_set_vq_state(struct vdpa_device *vdev, u16 idx, const struct vdpa_vq_state *state) +{ + struct snet *snet = vdpa_to_snet(vdev); + + /* We can set any state for config version 2+ */ + if (SNET_CFG_VER(snet, 2)) { + memcpy(&snet->vqs[idx]->vq_state, state, sizeof(*state)); + return 0; + } + + /* Older config - we can't set the VQ state. + * Return 0 only if this is the initial state we use in the DPU. 
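Editor's note on snet_kick_vq_with_data() above: the 32-bit value handed to the new .kick_vq_with_data op uses the same notification-data encoding seen in the ccw hunk earlier, VQ index in the low 16 bits and the next available index in the high 16 bits; the driver keeps the high half and ORs in its own SNET_KICK_VAL before writing the doorbell. Standalone sketch of that split:

#include <stdint.h>
#include <stdio.h>

#define EX_SNET_KICK_VAL 0x1u	/* same value as SNET_KICK_VAL */

int main(void)
{
	uint32_t data = 0x002a0003;	/* next_avail 0x2a, vq index 3 */
	uint16_t idx = data & 0xFFFFu;
	uint32_t kick = (data & 0xFFFF0000u) | EX_SNET_KICK_VAL;

	printf("vq %u <- doorbell value 0x%08x\n", (unsigned)idx, (unsigned)kick);
	return 0;
}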
+ */ + if (snet_vq_state_is_initial(snet, state)) + return 0; + return -EOPNOTSUPP; } static int snet_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state) { - /* Not supported */ - return -EOPNOTSUPP; + struct snet *snet = vdpa_to_snet(vdev); + + return snet_read_vq_state(snet, idx, state); } static int snet_get_vq_irq(struct vdpa_device *vdev, u16 idx) @@ -232,9 +212,9 @@ static int snet_reset_dev(struct snet *snet) if (!snet->status) return 0; - /* If DPU started, send a destroy message */ + /* If DPU started, destroy it */ if (snet->status & VIRTIO_CONFIG_S_DRIVER_OK) - ret = snet_send_msg(snet, SNET_MSG_DESTROY, true); + ret = snet_destroy_dev(snet); /* Clear VQs */ for (i = 0; i < snet->cfg->vq_num; i++) { @@ -258,7 +238,7 @@ static int snet_reset_dev(struct snet *snet) snet->dpu_ready = false; if (ret) - SNET_WARN(pdev, "Incomplete reset to SNET[%u] device\n", snet->sid); + SNET_WARN(pdev, "Incomplete reset to SNET[%u] device, err: %d\n", snet->sid, ret); else SNET_DBG(pdev, "Reset SNET[%u] device\n", snet->sid); @@ -356,7 +336,7 @@ static int snet_write_conf(struct snet *snet) * | DESC AREA | * | DEVICE AREA | * | DRIVER AREA | - * | RESERVED | + * | VQ STATE (CFG 2+) | RSVD | * * Magic number should be written last, this is the DPU indication that the data is ready */ @@ -391,12 +371,15 @@ static int snet_write_conf(struct snet *snet) off += 8; snet_write64(snet, off, snet->vqs[i]->driver_area); off += 8; + /* Write VQ state if config version is 2+ */ + if (SNET_CFG_VER(snet, 2)) + snet_write32(snet, off, *(u32 *)&snet->vqs[i]->vq_state); + off += 4; + /* Ignore reserved */ - off += 8; + off += 4; } - /* Clear snet messages address for this device */ - snet_write32(snet, snet->psnet->cfg.msg_off, 0); /* Write magic number - data is ready */ snet_write32(snet, snet->psnet->cfg.host_cfg_off, SNET_SIGNATURE); @@ -512,10 +495,25 @@ static void snet_set_config(struct vdpa_device *vdev, unsigned int offset, iowrite8(*buf_ptr++, cfg_ptr + i); } +static int snet_suspend(struct vdpa_device *vdev) +{ + struct snet *snet = vdpa_to_snet(vdev); + int ret; + + ret = snet_suspend_dev(snet); + if (ret) + SNET_ERR(snet->pdev, "SNET[%u] suspend failed, err: %d\n", snet->sid, ret); + else + SNET_DBG(snet->pdev, "Suspend SNET[%u] device\n", snet->sid); + + return ret; +} + static const struct vdpa_config_ops snet_config_ops = { .set_vq_address = snet_set_vq_address, .set_vq_num = snet_set_vq_num, .kick_vq = snet_kick_vq, + .kick_vq_with_data = snet_kick_vq_with_data, .set_vq_cb = snet_set_vq_cb, .set_vq_ready = snet_set_vq_ready, .get_vq_ready = snet_get_vq_ready, @@ -537,6 +535,7 @@ static const struct vdpa_config_ops snet_config_ops = { .set_status = snet_set_status, .get_config = snet_get_config, .set_config = snet_set_config, + .suspend = snet_suspend, }; static int psnet_open_pf_bar(struct pci_dev *pdev, struct psnet *psnet) @@ -697,7 +696,7 @@ static int psnet_read_cfg(struct pci_dev *pdev, struct psnet *psnet) off += 4; cfg->hwmon_off = psnet_read32(psnet, off); off += 4; - cfg->msg_off = psnet_read32(psnet, off); + cfg->ctrl_off = psnet_read32(psnet, off); off += 4; cfg->flags = psnet_read32(psnet, off); off += 4; @@ -997,6 +996,10 @@ static int snet_vdpa_probe_vf(struct pci_dev *pdev) goto free_irqs; } + /* Init control mutex and spinlock */ + mutex_init(&snet->ctrl_lock); + spin_lock_init(&snet->ctrl_spinlock); + /* Save pci device pointer */ snet->pdev = pdev; snet->psnet = psnet; @@ -1013,6 +1016,9 @@ static int snet_vdpa_probe_vf(struct pci_dev *pdev) /* Create a 
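Editor's note on the snet_write_conf() layout change above: per the layout comment, the tail of each per-VQ block is now desc area (8 bytes), device area (8), driver area (8), VQ state (4, config version 2+), and 4 reserved bytes, replacing the former 8 reserved bytes. A standalone mirror of that tail; the struct and field names are descriptive stand-ins, not driver types, and earlier per-VQ fields are omitted.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct ex_snet_vq_cfg_tail {
	uint64_t desc_area;
	uint64_t device_area;
	uint64_t driver_area;
	uint32_t vq_state;	/* new in config version 2 */
	uint32_t reserved;
} __attribute__((packed));

int main(void)
{
	printf("vq_state offset: %zu, tail size: %zu\n",
	       offsetof(struct ex_snet_vq_cfg_tail, vq_state),
	       sizeof(struct ex_snet_vq_cfg_tail));	/* 24 and 32 */
	return 0;
}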
VirtIO config pointer */ snet->cfg->virtio_cfg = snet->bar + snet->psnet->cfg.virtio_cfg_off; + /* Clear control registers */ + snet_ctrl_clear(snet); + pci_set_master(pdev); pci_set_drvdata(pdev, snet); diff --git a/drivers/vdpa/solidrun/snet_vdpa.h b/drivers/vdpa/solidrun/snet_vdpa.h index b7f34169053f..3c78d4e7d485 100644 --- a/drivers/vdpa/solidrun/snet_vdpa.h +++ b/drivers/vdpa/solidrun/snet_vdpa.h @@ -2,7 +2,7 @@ /* * SolidRun DPU driver for control plane * - * Copyright (C) 2022 SolidRun + * Copyright (C) 2022-2023 SolidRun * * Author: Alvaro Karsz <alvaro.karsz@solid-run.com> * @@ -20,10 +20,15 @@ #define SNET_INFO(pdev, fmt, ...) dev_info(&(pdev)->dev, "%s"fmt, "snet_vdpa: ", ##__VA_ARGS__) #define SNET_DBG(pdev, fmt, ...) dev_dbg(&(pdev)->dev, "%s"fmt, "snet_vdpa: ", ##__VA_ARGS__) #define SNET_HAS_FEATURE(s, f) ((s)->negotiated_features & BIT_ULL(f)) +/* Check if negotiated config version is at least @ver */ +#define SNET_CFG_VER(snet, ver) ((snet)->psnet->negotiated_cfg_ver >= (ver)) + /* VQ struct */ struct snet_vq { /* VQ callback */ struct vdpa_callback cb; + /* VQ state received from bus */ + struct vdpa_vq_state vq_state; /* desc base address */ u64 desc_area; /* device base address */ @@ -51,6 +56,10 @@ struct snet { struct vdpa_device vdpa; /* Config callback */ struct vdpa_callback cb; + /* To lock the control mechanism */ + struct mutex ctrl_lock; + /* Spinlock to protect critical parts in the control mechanism */ + spinlock_t ctrl_spinlock; /* array of virqueues */ struct snet_vq **vqs; /* Used features */ @@ -117,8 +126,8 @@ struct snet_cfg { u32 kick_off; /* Offset in PCI BAR for HW monitoring */ u32 hwmon_off; - /* Offset in PCI BAR for SNET messages */ - u32 msg_off; + /* Offset in PCI BAR for Control mechanism */ + u32 ctrl_off; /* Config general flags - enum snet_cfg_flags */ u32 flags; /* Reserved for future usage */ @@ -191,4 +200,9 @@ static inline void snet_write64(struct snet *snet, u32 off, u64 val) void psnet_create_hwmon(struct pci_dev *pdev); #endif +void snet_ctrl_clear(struct snet *snet); +int snet_destroy_dev(struct snet *snet); +int snet_read_vq_state(struct snet *snet, u16 idx, struct vdpa_vq_state *state); +int snet_suspend_dev(struct snet *snet); + #endif //_SNET_VDPA_H_ diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index eea23c630f7c..d343af4fa60e 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -11,8 +11,8 @@ #include <linux/module.h> #include <linux/device.h> #include <linux/kernel.h> +#include <linux/kthread.h> #include <linux/slab.h> -#include <linux/sched.h> #include <linux/dma-map-ops.h> #include <linux/vringh.h> #include <linux/vdpa.h> @@ -35,10 +35,44 @@ module_param(max_iotlb_entries, int, 0444); MODULE_PARM_DESC(max_iotlb_entries, "Maximum number of iotlb entries for each address space. 0 means unlimited. (default: 2048)"); +static bool use_va = true; +module_param(use_va, bool, 0444); +MODULE_PARM_DESC(use_va, "Enable/disable the device's ability to use VA"); + #define VDPASIM_QUEUE_ALIGN PAGE_SIZE #define VDPASIM_QUEUE_MAX 256 #define VDPASIM_VENDOR_ID 0 +struct vdpasim_mm_work { + struct kthread_work work; + struct vdpasim *vdpasim; + struct mm_struct *mm_to_bind; + int ret; +}; + +static void vdpasim_mm_work_fn(struct kthread_work *work) +{ + struct vdpasim_mm_work *mm_work = + container_of(work, struct vdpasim_mm_work, work); + struct vdpasim *vdpasim = mm_work->vdpasim; + + mm_work->ret = 0; + + //TODO: should we attach the cgroup of the mm owner? 
+ vdpasim->mm_bound = mm_work->mm_to_bind; +} + +static void vdpasim_worker_change_mm_sync(struct vdpasim *vdpasim, + struct vdpasim_mm_work *mm_work) +{ + struct kthread_work *work = &mm_work->work; + + kthread_init_work(work, vdpasim_mm_work_fn); + kthread_queue_work(vdpasim->worker, work); + + kthread_flush_work(work); +} + static struct vdpasim *vdpa_to_sim(struct vdpa_device *vdpa) { return container_of(vdpa, struct vdpasim, vdpa); @@ -59,13 +93,20 @@ static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx) { struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; uint16_t last_avail_idx = vq->vring.last_avail_idx; - - vringh_init_iotlb(&vq->vring, vdpasim->features, vq->num, true, - (struct vring_desc *)(uintptr_t)vq->desc_addr, - (struct vring_avail *) - (uintptr_t)vq->driver_addr, - (struct vring_used *) - (uintptr_t)vq->device_addr); + struct vring_desc *desc = (struct vring_desc *) + (uintptr_t)vq->desc_addr; + struct vring_avail *avail = (struct vring_avail *) + (uintptr_t)vq->driver_addr; + struct vring_used *used = (struct vring_used *) + (uintptr_t)vq->device_addr; + + if (use_va && vdpasim->mm_bound) { + vringh_init_iotlb_va(&vq->vring, vdpasim->features, vq->num, + true, desc, avail, used); + } else { + vringh_init_iotlb(&vq->vring, vdpasim->features, vq->num, + true, desc, avail, used); + } vq->vring.last_avail_idx = last_avail_idx; @@ -127,6 +168,25 @@ static void vdpasim_do_reset(struct vdpasim *vdpasim) static const struct vdpa_config_ops vdpasim_config_ops; static const struct vdpa_config_ops vdpasim_batch_config_ops; +static void vdpasim_work_fn(struct kthread_work *work) +{ + struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); + struct mm_struct *mm = vdpasim->mm_bound; + + if (use_va && mm) { + if (!mmget_not_zero(mm)) + return; + kthread_use_mm(mm); + } + + vdpasim->dev_attr.work_fn(vdpasim); + + if (use_va && mm) { + kthread_unuse_mm(mm); + mmput(mm); + } +} + struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr, const struct vdpa_dev_set_config *config) { @@ -155,7 +215,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr, vdpa = __vdpa_alloc_device(NULL, ops, dev_attr->ngroups, dev_attr->nas, dev_attr->alloc_size, - dev_attr->name, false); + dev_attr->name, use_va); if (IS_ERR(vdpa)) { ret = PTR_ERR(vdpa); goto err_alloc; @@ -163,11 +223,17 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr, vdpasim = vdpa_to_sim(vdpa); vdpasim->dev_attr = *dev_attr; - INIT_WORK(&vdpasim->work, dev_attr->work_fn); - spin_lock_init(&vdpasim->lock); + dev = &vdpasim->vdpa.dev; + + kthread_init_work(&vdpasim->work, vdpasim_work_fn); + vdpasim->worker = kthread_create_worker(0, "vDPA sim worker: %s", + dev_attr->name); + if (IS_ERR(vdpasim->worker)) + goto err_iommu; + + mutex_init(&vdpasim->mutex); spin_lock_init(&vdpasim->iommu_lock); - dev = &vdpasim->vdpa.dev; dev->dma_mask = &dev->coherent_dma_mask; if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64))) goto err_iommu; @@ -195,10 +261,6 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr, for (i = 0; i < vdpasim->dev_attr.nas; i++) vhost_iotlb_init(&vdpasim->iommu[i], max_iotlb_entries, 0); - vdpasim->buffer = kvmalloc(dev_attr->buffer_size, GFP_KERNEL); - if (!vdpasim->buffer) - goto err_iommu; - for (i = 0; i < dev_attr->nvqs; i++) vringh_set_iotlb(&vdpasim->vqs[i].vring, &vdpasim->iommu[0], &vdpasim->iommu_lock); @@ -214,6 +276,12 @@ err_alloc: } EXPORT_SYMBOL_GPL(vdpasim_create); +void vdpasim_schedule_work(struct vdpasim *vdpasim) +{ 
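Editor's note on the vdpa_sim rework above: the per-device kthread worker exists so the simulator body can run with the userspace mm that was handed in via .bind_mm; once the worker calls kthread_use_mm(), ordinary user-memory accessors resolve in that address space, which is what allows vrings placed at virtual addresses (use_va) to be parsed. The following is a hedged sketch of that idea only, not code from the patch; it assumes the caller already holds a valid mm pointer, and error handling is reduced to the minimum.

#include <linux/types.h>
#include <linux/kthread.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>

/* Illustrative helper: read a u16 from a user VA inside a bound mm. */
static int example_peek_u16(struct mm_struct *mm,
			    const u16 __user *uptr, u16 *out)
{
	int ret = -EFAULT;

	if (!mmget_not_zero(mm))
		return -EINVAL;

	kthread_use_mm(mm);	/* user accessors now target this mm */
	if (!copy_from_user(out, uptr, sizeof(*out)))
		ret = 0;
	kthread_unuse_mm(mm);
	mmput(mm);

	return ret;
}

This is also why the virtqueue state lock becomes a mutex in the hunks that follow: work that may fault in user memory can sleep, so it can no longer run under a spinlock.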
+ kthread_queue_work(vdpasim->worker, &vdpasim->work); +} +EXPORT_SYMBOL_GPL(vdpasim_schedule_work); + static int vdpasim_set_vq_address(struct vdpa_device *vdpa, u16 idx, u64 desc_area, u64 driver_area, u64 device_area) @@ -248,7 +316,7 @@ static void vdpasim_kick_vq(struct vdpa_device *vdpa, u16 idx) } if (vq->ready) - schedule_work(&vdpasim->work); + vdpasim_schedule_work(vdpasim); } static void vdpasim_set_vq_cb(struct vdpa_device *vdpa, u16 idx, @@ -267,13 +335,13 @@ static void vdpasim_set_vq_ready(struct vdpa_device *vdpa, u16 idx, bool ready) struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; bool old_ready; - spin_lock(&vdpasim->lock); + mutex_lock(&vdpasim->mutex); old_ready = vq->ready; vq->ready = ready; if (vq->ready && !old_ready) { vdpasim_queue_ready(vdpasim, idx); } - spin_unlock(&vdpasim->lock); + mutex_unlock(&vdpasim->mutex); } static bool vdpasim_get_vq_ready(struct vdpa_device *vdpa, u16 idx) @@ -291,9 +359,9 @@ static int vdpasim_set_vq_state(struct vdpa_device *vdpa, u16 idx, struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx]; struct vringh *vrh = &vq->vring; - spin_lock(&vdpasim->lock); + mutex_lock(&vdpasim->mutex); vrh->last_avail_idx = state->split.avail_index; - spin_unlock(&vdpasim->lock); + mutex_unlock(&vdpasim->mutex); return 0; } @@ -390,9 +458,9 @@ static u8 vdpasim_get_status(struct vdpa_device *vdpa) struct vdpasim *vdpasim = vdpa_to_sim(vdpa); u8 status; - spin_lock(&vdpasim->lock); + mutex_lock(&vdpasim->mutex); status = vdpasim->status; - spin_unlock(&vdpasim->lock); + mutex_unlock(&vdpasim->mutex); return status; } @@ -401,19 +469,19 @@ static void vdpasim_set_status(struct vdpa_device *vdpa, u8 status) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); - spin_lock(&vdpasim->lock); + mutex_lock(&vdpasim->mutex); vdpasim->status = status; - spin_unlock(&vdpasim->lock); + mutex_unlock(&vdpasim->mutex); } static int vdpasim_reset(struct vdpa_device *vdpa) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); - spin_lock(&vdpasim->lock); + mutex_lock(&vdpasim->mutex); vdpasim->status = 0; vdpasim_do_reset(vdpasim); - spin_unlock(&vdpasim->lock); + mutex_unlock(&vdpasim->mutex); return 0; } @@ -422,9 +490,9 @@ static int vdpasim_suspend(struct vdpa_device *vdpa) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); - spin_lock(&vdpasim->lock); + mutex_lock(&vdpasim->mutex); vdpasim->running = false; - spin_unlock(&vdpasim->lock); + mutex_unlock(&vdpasim->mutex); return 0; } @@ -434,7 +502,7 @@ static int vdpasim_resume(struct vdpa_device *vdpa) struct vdpasim *vdpasim = vdpa_to_sim(vdpa); int i; - spin_lock(&vdpasim->lock); + mutex_lock(&vdpasim->mutex); vdpasim->running = true; if (vdpasim->pending_kick) { @@ -445,7 +513,7 @@ static int vdpasim_resume(struct vdpa_device *vdpa) vdpasim->pending_kick = false; } - spin_unlock(&vdpasim->lock); + mutex_unlock(&vdpasim->mutex); return 0; } @@ -517,14 +585,14 @@ static int vdpasim_set_group_asid(struct vdpa_device *vdpa, unsigned int group, iommu = &vdpasim->iommu[asid]; - spin_lock(&vdpasim->lock); + mutex_lock(&vdpasim->mutex); for (i = 0; i < vdpasim->dev_attr.nvqs; i++) if (vdpasim_get_vq_group(vdpa, i) == group) vringh_set_iotlb(&vdpasim->vqs[i].vring, iommu, &vdpasim->iommu_lock); - spin_unlock(&vdpasim->lock); + mutex_unlock(&vdpasim->mutex); return 0; } @@ -563,6 +631,30 @@ err: return ret; } +static int vdpasim_bind_mm(struct vdpa_device *vdpa, struct mm_struct *mm) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + struct vdpasim_mm_work mm_work; + + mm_work.vdpasim = vdpasim; + mm_work.mm_to_bind = mm; + + 
vdpasim_worker_change_mm_sync(vdpasim, &mm_work); + + return mm_work.ret; +} + +static void vdpasim_unbind_mm(struct vdpa_device *vdpa) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + struct vdpasim_mm_work mm_work; + + mm_work.vdpasim = vdpasim; + mm_work.mm_to_bind = NULL; + + vdpasim_worker_change_mm_sync(vdpasim, &mm_work); +} + static int vdpasim_dma_map(struct vdpa_device *vdpa, unsigned int asid, u64 iova, u64 size, u64 pa, u32 perm, void *opaque) @@ -610,14 +702,16 @@ static void vdpasim_free(struct vdpa_device *vdpa) struct vdpasim *vdpasim = vdpa_to_sim(vdpa); int i; - cancel_work_sync(&vdpasim->work); + kthread_cancel_work_sync(&vdpasim->work); + kthread_destroy_worker(vdpasim->worker); for (i = 0; i < vdpasim->dev_attr.nvqs; i++) { vringh_kiov_cleanup(&vdpasim->vqs[i].out_iov); vringh_kiov_cleanup(&vdpasim->vqs[i].in_iov); } - kvfree(vdpasim->buffer); + vdpasim->dev_attr.free(vdpasim); + for (i = 0; i < vdpasim->dev_attr.nas; i++) vhost_iotlb_reset(&vdpasim->iommu[i]); kfree(vdpasim->iommu); @@ -658,6 +752,8 @@ static const struct vdpa_config_ops vdpasim_config_ops = { .set_group_asid = vdpasim_set_group_asid, .dma_map = vdpasim_dma_map, .dma_unmap = vdpasim_dma_unmap, + .bind_mm = vdpasim_bind_mm, + .unbind_mm = vdpasim_unbind_mm, .free = vdpasim_free, }; @@ -692,6 +788,8 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = { .get_iova_range = vdpasim_get_iova_range, .set_group_asid = vdpasim_set_group_asid, .set_map = vdpasim_set_map, + .bind_mm = vdpasim_bind_mm, + .unbind_mm = vdpasim_unbind_mm, .free = vdpasim_free, }; diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h index 144858636c10..bb137e479763 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.h +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h @@ -39,33 +39,34 @@ struct vdpasim_dev_attr { u64 supported_features; size_t alloc_size; size_t config_size; - size_t buffer_size; int nvqs; u32 id; u32 ngroups; u32 nas; - work_func_t work_fn; + void (*work_fn)(struct vdpasim *vdpasim); void (*get_config)(struct vdpasim *vdpasim, void *config); void (*set_config)(struct vdpasim *vdpasim, const void *config); int (*get_stats)(struct vdpasim *vdpasim, u16 idx, struct sk_buff *msg, struct netlink_ext_ack *extack); + void (*free)(struct vdpasim *vdpasim); }; /* State of each vdpasim device */ struct vdpasim { struct vdpa_device vdpa; struct vdpasim_virtqueue *vqs; - struct work_struct work; + struct kthread_worker *worker; + struct kthread_work work; + struct mm_struct *mm_bound; struct vdpasim_dev_attr dev_attr; - /* spinlock to synchronize virtqueue state */ - spinlock_t lock; + /* mutex to synchronize virtqueue state */ + struct mutex mutex; /* virtio config according to device type */ void *config; struct vhost_iotlb *iommu; bool *iommu_pt; - void *buffer; u32 status; u32 generation; u64 features; @@ -78,6 +79,7 @@ struct vdpasim { struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *attr, const struct vdpa_dev_set_config *config); +void vdpasim_schedule_work(struct vdpasim *vdpasim); /* TODO: cross-endian support */ static inline bool vdpasim_is_little_endian(struct vdpasim *vdpasim) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c index 5117959bed8a..00d7d72713be 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c @@ -11,7 +11,6 @@ #include <linux/module.h> #include <linux/device.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/blkdev.h> #include <linux/vringh.h> #include <linux/vdpa.h> @@ 
-44,8 +43,39 @@ #define VDPASIM_BLK_AS_NUM 1 #define VDPASIM_BLK_GROUP_NUM 1 +struct vdpasim_blk { + struct vdpasim vdpasim; + void *buffer; + bool shared_backend; +}; + +static struct vdpasim_blk *sim_to_blk(struct vdpasim *vdpasim) +{ + return container_of(vdpasim, struct vdpasim_blk, vdpasim); +} + static char vdpasim_blk_id[VIRTIO_BLK_ID_BYTES] = "vdpa_blk_sim"; +static bool shared_backend; +module_param(shared_backend, bool, 0444); +MODULE_PARM_DESC(shared_backend, "Enable the shared backend between virtio-blk devices"); + +static void *shared_buffer; +/* mutex to synchronize shared_buffer access */ +static DEFINE_MUTEX(shared_buffer_mutex); + +static void vdpasim_blk_buffer_lock(struct vdpasim_blk *blk) +{ + if (blk->shared_backend) + mutex_lock(&shared_buffer_mutex); +} + +static void vdpasim_blk_buffer_unlock(struct vdpasim_blk *blk) +{ + if (blk->shared_backend) + mutex_unlock(&shared_buffer_mutex); +} + static bool vdpasim_blk_check_range(struct vdpasim *vdpasim, u64 start_sector, u64 num_sectors, u64 max_sectors) { @@ -79,6 +109,7 @@ static bool vdpasim_blk_check_range(struct vdpasim *vdpasim, u64 start_sector, static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, struct vdpasim_virtqueue *vq) { + struct vdpasim_blk *blk = sim_to_blk(vdpasim); size_t pushed = 0, to_pull, to_push; struct virtio_blk_outhdr hdr; bool handled = false; @@ -144,9 +175,10 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, break; } + vdpasim_blk_buffer_lock(blk); bytes = vringh_iov_push_iotlb(&vq->vring, &vq->in_iov, - vdpasim->buffer + offset, - to_push); + blk->buffer + offset, to_push); + vdpasim_blk_buffer_unlock(blk); if (bytes < 0) { dev_dbg(&vdpasim->vdpa.dev, "vringh_iov_push_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n", @@ -166,9 +198,10 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, break; } + vdpasim_blk_buffer_lock(blk); bytes = vringh_iov_pull_iotlb(&vq->vring, &vq->out_iov, - vdpasim->buffer + offset, - to_pull); + blk->buffer + offset, to_pull); + vdpasim_blk_buffer_unlock(blk); if (bytes < 0) { dev_dbg(&vdpasim->vdpa.dev, "vringh_iov_pull_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n", @@ -248,8 +281,10 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, } if (type == VIRTIO_BLK_T_WRITE_ZEROES) { - memset(vdpasim->buffer + offset, 0, + vdpasim_blk_buffer_lock(blk); + memset(blk->buffer + offset, 0, num_sectors << SECTOR_SHIFT); + vdpasim_blk_buffer_unlock(blk); } break; @@ -286,13 +321,12 @@ err: return handled; } -static void vdpasim_blk_work(struct work_struct *work) +static void vdpasim_blk_work(struct vdpasim *vdpasim) { - struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); bool reschedule = false; int i; - spin_lock(&vdpasim->lock); + mutex_lock(&vdpasim->mutex); if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK)) goto out; @@ -323,10 +357,10 @@ static void vdpasim_blk_work(struct work_struct *work) } } out: - spin_unlock(&vdpasim->lock); + mutex_unlock(&vdpasim->mutex); if (reschedule) - schedule_work(&vdpasim->work); + vdpasim_schedule_work(vdpasim); } static void vdpasim_blk_get_config(struct vdpasim *vdpasim, void *config) @@ -355,6 +389,14 @@ static void vdpasim_blk_get_config(struct vdpasim *vdpasim, void *config) } +static void vdpasim_blk_free(struct vdpasim *vdpasim) +{ + struct vdpasim_blk *blk = sim_to_blk(vdpasim); + + if (!blk->shared_backend) + kvfree(blk->buffer); +} + static void vdpasim_blk_mgmtdev_release(struct device *dev) { } @@ -368,6 +410,7 @@ static int vdpasim_blk_dev_add(struct vdpa_mgmt_dev 
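Editor's note on the shared-backend option above: each simulated disk normally owns its buffer, but with shared_backend=y all devices point at one shared_buffer, and the vdpasim_blk_buffer_lock/unlock helpers take the global mutex only in that case. A standalone userspace model of the conditional-locking pattern (compile with -pthread; all names here are stand-ins):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t shared_lock = PTHREAD_MUTEX_INITIALIZER;

struct ex_blk {
	bool shared_backend;
	unsigned char *buffer;
};

static void ex_buffer_lock(struct ex_blk *blk)
{
	if (blk->shared_backend)
		pthread_mutex_lock(&shared_lock);
}

static void ex_buffer_unlock(struct ex_blk *blk)
{
	if (blk->shared_backend)
		pthread_mutex_unlock(&shared_lock);
}

int main(void)
{
	unsigned char backing[512] = { 0 };
	struct ex_blk blk = { .shared_backend = true, .buffer = backing };

	ex_buffer_lock(&blk);
	blk.buffer[0] = 0xab;	/* stands in for a sector write */
	ex_buffer_unlock(&blk);

	printf("first byte: 0x%02x\n", blk.buffer[0]);
	return 0;
}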
*mdev, const char *name, const struct vdpa_dev_set_config *config) { struct vdpasim_dev_attr dev_attr = {}; + struct vdpasim_blk *blk; struct vdpasim *simdev; int ret; @@ -378,16 +421,30 @@ static int vdpasim_blk_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, dev_attr.nvqs = VDPASIM_BLK_VQ_NUM; dev_attr.ngroups = VDPASIM_BLK_GROUP_NUM; dev_attr.nas = VDPASIM_BLK_AS_NUM; - dev_attr.alloc_size = sizeof(struct vdpasim); + dev_attr.alloc_size = sizeof(struct vdpasim_blk); dev_attr.config_size = sizeof(struct virtio_blk_config); dev_attr.get_config = vdpasim_blk_get_config; dev_attr.work_fn = vdpasim_blk_work; - dev_attr.buffer_size = VDPASIM_BLK_CAPACITY << SECTOR_SHIFT; + dev_attr.free = vdpasim_blk_free; simdev = vdpasim_create(&dev_attr, config); if (IS_ERR(simdev)) return PTR_ERR(simdev); + blk = sim_to_blk(simdev); + blk->shared_backend = shared_backend; + + if (blk->shared_backend) { + blk->buffer = shared_buffer; + } else { + blk->buffer = kvmalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT, + GFP_KERNEL); + if (!blk->buffer) { + ret = -ENOMEM; + goto put_dev; + } + } + ret = _vdpa_register_device(&simdev->vdpa, VDPASIM_BLK_VQ_NUM); if (ret) goto put_dev; @@ -437,6 +494,15 @@ static int __init vdpasim_blk_init(void) if (ret) goto parent_err; + if (shared_backend) { + shared_buffer = kvmalloc(VDPASIM_BLK_CAPACITY << SECTOR_SHIFT, + GFP_KERNEL); + if (!shared_buffer) { + ret = -ENOMEM; + goto parent_err; + } + } + return 0; parent_err: @@ -446,6 +512,7 @@ parent_err: static void __exit vdpasim_blk_exit(void) { + kvfree(shared_buffer); vdpa_mgmtdev_unregister(&mgmt_dev); device_unregister(&vdpasim_blk_mgmtdev); } diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c index dfe2ce341803..cfe962911804 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c @@ -11,7 +11,6 @@ #include <linux/module.h> #include <linux/device.h> #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/etherdevice.h> #include <linux/vringh.h> #include <linux/vdpa.h> @@ -59,6 +58,7 @@ struct vdpasim_net{ struct vdpasim_dataq_stats tx_stats; struct vdpasim_dataq_stats rx_stats; struct vdpasim_cq_stats cq_stats; + void *buffer; }; static struct vdpasim_net *sim_to_net(struct vdpasim *vdpasim) @@ -88,14 +88,15 @@ static bool receive_filter(struct vdpasim *vdpasim, size_t len) size_t hdr_len = modern ? 
sizeof(struct virtio_net_hdr_v1) : sizeof(struct virtio_net_hdr); struct virtio_net_config *vio_config = vdpasim->config; + struct vdpasim_net *net = sim_to_net(vdpasim); if (len < ETH_ALEN + hdr_len) return false; - if (is_broadcast_ether_addr(vdpasim->buffer + hdr_len) || - is_multicast_ether_addr(vdpasim->buffer + hdr_len)) + if (is_broadcast_ether_addr(net->buffer + hdr_len) || + is_multicast_ether_addr(net->buffer + hdr_len)) return true; - if (!strncmp(vdpasim->buffer + hdr_len, vio_config->mac, ETH_ALEN)) + if (!strncmp(net->buffer + hdr_len, vio_config->mac, ETH_ALEN)) return true; return false; @@ -192,9 +193,8 @@ static void vdpasim_handle_cvq(struct vdpasim *vdpasim) u64_stats_update_end(&net->cq_stats.syncp); } -static void vdpasim_net_work(struct work_struct *work) +static void vdpasim_net_work(struct vdpasim *vdpasim) { - struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); struct vdpasim_virtqueue *txq = &vdpasim->vqs[1]; struct vdpasim_virtqueue *rxq = &vdpasim->vqs[0]; struct vdpasim_net *net = sim_to_net(vdpasim); @@ -203,7 +203,7 @@ static void vdpasim_net_work(struct work_struct *work) u64 rx_drops = 0, rx_overruns = 0, rx_errors = 0, tx_errors = 0; int err; - spin_lock(&vdpasim->lock); + mutex_lock(&vdpasim->mutex); if (!vdpasim->running) goto out; @@ -227,8 +227,7 @@ static void vdpasim_net_work(struct work_struct *work) ++tx_pkts; read = vringh_iov_pull_iotlb(&txq->vring, &txq->out_iov, - vdpasim->buffer, - PAGE_SIZE); + net->buffer, PAGE_SIZE); tx_bytes += read; @@ -247,7 +246,7 @@ static void vdpasim_net_work(struct work_struct *work) } write = vringh_iov_push_iotlb(&rxq->vring, &rxq->in_iov, - vdpasim->buffer, read); + net->buffer, read); if (write <= 0) { ++rx_errors; break; @@ -260,13 +259,13 @@ static void vdpasim_net_work(struct work_struct *work) vdpasim_net_complete(rxq, write); if (tx_pkts > 4) { - schedule_work(&vdpasim->work); + vdpasim_schedule_work(vdpasim); goto out; } } out: - spin_unlock(&vdpasim->lock); + mutex_unlock(&vdpasim->mutex); u64_stats_update_begin(&net->tx_stats.syncp); net->tx_stats.pkts += tx_pkts; @@ -429,6 +428,13 @@ static void vdpasim_net_setup_config(struct vdpasim *vdpasim, vio_config->mtu = cpu_to_vdpasim16(vdpasim, 1500); } +static void vdpasim_net_free(struct vdpasim *vdpasim) +{ + struct vdpasim_net *net = sim_to_net(vdpasim); + + kvfree(net->buffer); +} + static void vdpasim_net_mgmtdev_release(struct device *dev) { } @@ -458,7 +464,7 @@ static int vdpasim_net_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, dev_attr.get_config = vdpasim_net_get_config; dev_attr.work_fn = vdpasim_net_work; dev_attr.get_stats = vdpasim_net_get_stats; - dev_attr.buffer_size = PAGE_SIZE; + dev_attr.free = vdpasim_net_free; simdev = vdpasim_create(&dev_attr, config); if (IS_ERR(simdev)) @@ -472,6 +478,12 @@ static int vdpasim_net_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, u64_stats_init(&net->rx_stats.syncp); u64_stats_init(&net->cq_stats.syncp); + net->buffer = kvmalloc(PAGE_SIZE, GFP_KERNEL); + if (!net->buffer) { + ret = -ENOMEM; + goto reg_err; + } + /* * Initialization must be completed before this call, since it can * connect the device to the vDPA bus, so requests can arrive after diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index c421b83f6fa2..de97e38c3b82 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -37,10 +37,15 @@ #define DRV_LICENSE "GPL v2" #define VDUSE_DEV_MAX (1U << MINORBITS) +#define VDUSE_MAX_BOUNCE_SIZE (1024 * 
1024 * 1024) +#define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024) #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024) -#define VDUSE_IOVA_SIZE (128 * 1024 * 1024) +/* 128 MB reserved for virtqueue creation */ +#define VDUSE_IOVA_SIZE (VDUSE_MAX_BOUNCE_SIZE + 128 * 1024 * 1024) #define VDUSE_MSG_DEFAULT_TIMEOUT 30 +#define IRQ_UNBOUND -1 + struct vduse_virtqueue { u16 index; u16 num_max; @@ -57,6 +62,9 @@ struct vduse_virtqueue { struct vdpa_callback cb; struct work_struct inject; struct work_struct kick; + int irq_effective_cpu; + struct cpumask irq_affinity; + struct kobject kobj; }; struct vduse_dev; @@ -76,7 +84,7 @@ struct vduse_umem { struct vduse_dev { struct vduse_vdpa *vdev; struct device *dev; - struct vduse_virtqueue *vqs; + struct vduse_virtqueue **vqs; struct vduse_iova_domain *domain; char *name; struct mutex lock; @@ -106,6 +114,8 @@ struct vduse_dev { u32 vq_align; struct vduse_umem *umem; struct mutex mem_lock; + unsigned int bounce_size; + struct mutex domain_lock; }; struct vduse_dev_msg { @@ -128,6 +138,7 @@ static struct class *vduse_class; static struct cdev vduse_ctrl_cdev; static struct cdev vduse_cdev; static struct workqueue_struct *vduse_irq_wq; +static struct workqueue_struct *vduse_irq_bound_wq; static u32 allowed_device_id[] = { VIRTIO_ID_BLOCK, @@ -419,7 +430,7 @@ static void vduse_dev_reset(struct vduse_dev *dev) struct vduse_iova_domain *domain = dev->domain; /* The coherent mappings are handled in vduse_dev_free_coherent() */ - if (domain->bounce_map) + if (domain && domain->bounce_map) vduse_domain_reset_bounce_map(domain); down_write(&dev->rwsem); @@ -434,7 +445,7 @@ static void vduse_dev_reset(struct vduse_dev *dev) flush_work(&dev->inject); for (i = 0; i < dev->vq_num; i++) { - struct vduse_virtqueue *vq = &dev->vqs[i]; + struct vduse_virtqueue *vq = dev->vqs[i]; vq->ready = false; vq->desc_addr = 0; @@ -453,6 +464,7 @@ static void vduse_dev_reset(struct vduse_dev *dev) spin_lock(&vq->irq_lock); vq->cb.callback = NULL; vq->cb.private = NULL; + vq->cb.trigger = NULL; spin_unlock(&vq->irq_lock); flush_work(&vq->inject); flush_work(&vq->kick); @@ -466,7 +478,7 @@ static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx, u64 device_area) { struct vduse_dev *dev = vdpa_to_vduse(vdpa); - struct vduse_virtqueue *vq = &dev->vqs[idx]; + struct vduse_virtqueue *vq = dev->vqs[idx]; vq->desc_addr = desc_area; vq->driver_addr = driver_area; @@ -500,7 +512,7 @@ static void vduse_vq_kick_work(struct work_struct *work) static void vduse_vdpa_kick_vq(struct vdpa_device *vdpa, u16 idx) { struct vduse_dev *dev = vdpa_to_vduse(vdpa); - struct vduse_virtqueue *vq = &dev->vqs[idx]; + struct vduse_virtqueue *vq = dev->vqs[idx]; if (!eventfd_signal_allowed()) { schedule_work(&vq->kick); @@ -513,18 +525,19 @@ static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx, struct vdpa_callback *cb) { struct vduse_dev *dev = vdpa_to_vduse(vdpa); - struct vduse_virtqueue *vq = &dev->vqs[idx]; + struct vduse_virtqueue *vq = dev->vqs[idx]; spin_lock(&vq->irq_lock); vq->cb.callback = cb->callback; vq->cb.private = cb->private; + vq->cb.trigger = cb->trigger; spin_unlock(&vq->irq_lock); } static void vduse_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num) { struct vduse_dev *dev = vdpa_to_vduse(vdpa); - struct vduse_virtqueue *vq = &dev->vqs[idx]; + struct vduse_virtqueue *vq = dev->vqs[idx]; vq->num = num; } @@ -533,7 +546,7 @@ static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa, u16 idx, bool ready) { struct vduse_dev *dev = vdpa_to_vduse(vdpa); - struct 
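Editor's note on the VDUSE constants above: the default bounce area stays at 64 MiB, VDUSE_MIN/MAX_BOUNCE_SIZE bound the new per-device bounce_size field, and VDUSE_IOVA_SIZE keeps an extra 128 MiB beyond the largest bounce area reserved for virtqueues. The validation of a user-requested bounce size is not visible in these hunks, so the clamping below is an assumption about how the bounds are meant to be used, shown as a standalone sketch.

#include <stdint.h>
#include <stdio.h>

#define EX_MIN_BOUNCE (1024ull * 1024)		/* VDUSE_MIN_BOUNCE_SIZE */
#define EX_MAX_BOUNCE (1024ull * 1024 * 1024)	/* VDUSE_MAX_BOUNCE_SIZE */
#define EX_VQ_RESERVE (128ull * 1024 * 1024)	/* room kept for vrings  */

int main(void)
{
	uint64_t requested = 256ull * 1024 * 1024;	/* hypothetical user value */
	uint64_t bounce = requested;

	if (bounce < EX_MIN_BOUNCE)	/* assumed clamping, not from the hunks */
		bounce = EX_MIN_BOUNCE;
	if (bounce > EX_MAX_BOUNCE)
		bounce = EX_MAX_BOUNCE;

	printf("bounce %llu MiB, iova space %llu MiB\n",
	       (unsigned long long)(bounce >> 20),
	       (unsigned long long)((bounce + EX_VQ_RESERVE) >> 20));
	return 0;
}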
vduse_virtqueue *vq = &dev->vqs[idx]; + struct vduse_virtqueue *vq = dev->vqs[idx]; vq->ready = ready; } @@ -541,7 +554,7 @@ static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa, static bool vduse_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 idx) { struct vduse_dev *dev = vdpa_to_vduse(vdpa); - struct vduse_virtqueue *vq = &dev->vqs[idx]; + struct vduse_virtqueue *vq = dev->vqs[idx]; return vq->ready; } @@ -550,7 +563,7 @@ static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx, const struct vdpa_vq_state *state) { struct vduse_dev *dev = vdpa_to_vduse(vdpa); - struct vduse_virtqueue *vq = &dev->vqs[idx]; + struct vduse_virtqueue *vq = dev->vqs[idx]; if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) { vq->state.packed.last_avail_counter = @@ -569,7 +582,7 @@ static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx, struct vdpa_vq_state *state) { struct vduse_dev *dev = vdpa_to_vduse(vdpa); - struct vduse_virtqueue *vq = &dev->vqs[idx]; + struct vduse_virtqueue *vq = dev->vqs[idx]; if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) return vduse_dev_get_vq_state_packed(dev, vq, &state->packed); @@ -624,8 +637,8 @@ static u16 vduse_vdpa_get_vq_num_max(struct vdpa_device *vdpa) int i; for (i = 0; i < dev->vq_num; i++) - if (num_max < dev->vqs[i].num_max) - num_max = dev->vqs[i].num_max; + if (num_max < dev->vqs[i]->num_max) + num_max = dev->vqs[i]->num_max; return num_max; } @@ -708,6 +721,23 @@ static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa) return dev->generation; } +static int vduse_vdpa_set_vq_affinity(struct vdpa_device *vdpa, u16 idx, + const struct cpumask *cpu_mask) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + cpumask_copy(&dev->vqs[idx]->irq_affinity, cpu_mask); + return 0; +} + +static const struct cpumask * +vduse_vdpa_get_vq_affinity(struct vdpa_device *vdpa, u16 idx) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return &dev->vqs[idx]->irq_affinity; +} + static int vduse_vdpa_set_map(struct vdpa_device *vdpa, unsigned int asid, struct vhost_iotlb *iotlb) @@ -758,6 +788,8 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = { .get_config = vduse_vdpa_get_config, .set_config = vduse_vdpa_set_config, .get_generation = vduse_vdpa_get_generation, + .set_vq_affinity = vduse_vdpa_set_vq_affinity, + .get_vq_affinity = vduse_vdpa_get_vq_affinity, .reset = vduse_vdpa_reset, .set_map = vduse_vdpa_set_map, .free = vduse_vdpa_free, @@ -863,7 +895,7 @@ static int vduse_kickfd_setup(struct vduse_dev *dev, return -EINVAL; index = array_index_nospec(eventfd->index, dev->vq_num); - vq = &dev->vqs[index]; + vq = dev->vqs[index]; if (eventfd->fd >= 0) { ctx = eventfd_ctx_fdget(eventfd->fd); if (IS_ERR(ctx)) @@ -889,7 +921,7 @@ static bool vduse_dev_is_ready(struct vduse_dev *dev) int i; for (i = 0; i < dev->vq_num; i++) - if (!dev->vqs[i].num_max) + if (!dev->vqs[i]->num_max) return false; return true; @@ -916,8 +948,26 @@ static void vduse_vq_irq_inject(struct work_struct *work) spin_unlock_irq(&vq->irq_lock); } +static bool vduse_vq_signal_irqfd(struct vduse_virtqueue *vq) +{ + bool signal = false; + + if (!vq->cb.trigger) + return false; + + spin_lock_irq(&vq->irq_lock); + if (vq->ready && vq->cb.trigger) { + eventfd_signal(vq->cb.trigger, 1); + signal = true; + } + spin_unlock_irq(&vq->irq_lock); + + return signal; +} + static int vduse_dev_queue_irq_work(struct vduse_dev *dev, - struct work_struct *irq_work) + struct work_struct *irq_work, + int irq_effective_cpu) { int ret = -EINVAL; @@ -926,7 +976,11 @@ 
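The hunk that follows picks between the unbound irq workqueue and the new CPU-bound one based on a per-virtqueue effective CPU, which is in turn derived from the irq_cb_affinity mask exposed via sysfs. A rough sketch of that CPU selection, with illustrative names and simplified logic (not the exact vduse_vq_update_effective_cpu() body):

#include <linux/cpumask.h>

#define IRQ_UNBOUND -1	/* as in the #define added above */

/*
 * Illustrative only: round-robin to the next online CPU in the callback
 * affinity mask, falling back to IRQ_UNBOUND when the mask has no online CPU.
 */
static int pick_effective_cpu(int prev_cpu, const struct cpumask *affinity)
{
	int cpu = cpumask_next_and(prev_cpu, affinity, cpu_online_mask);

	if (cpu >= nr_cpu_ids)
		cpu = cpumask_first_and(affinity, cpu_online_mask);

	return cpu < nr_cpu_ids ? cpu : IRQ_UNBOUND;
}
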
static int vduse_dev_queue_irq_work(struct vduse_dev *dev, goto unlock; ret = 0; - queue_work(vduse_irq_wq, irq_work); + if (irq_effective_cpu == IRQ_UNBOUND) + queue_work(vduse_irq_wq, irq_work); + else + queue_work_on(irq_effective_cpu, + vduse_irq_bound_wq, irq_work); unlock: up_read(&dev->rwsem); @@ -944,6 +998,9 @@ static int vduse_dev_dereg_umem(struct vduse_dev *dev, goto unlock; ret = -EINVAL; + if (!dev->domain) + goto unlock; + if (dev->umem->iova != iova || size != dev->domain->bounce_size) goto unlock; @@ -970,7 +1027,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev, unsigned long npages, lock_limit; int ret; - if (!dev->domain->bounce_map || + if (!dev->domain || !dev->domain->bounce_map || size != dev->domain->bounce_size || iova != 0 || uaddr & ~PAGE_MASK) return -EINVAL; @@ -1029,6 +1086,22 @@ unlock: return ret; } +static void vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq) +{ + int curr_cpu = vq->irq_effective_cpu; + + while (true) { + curr_cpu = cpumask_next(curr_cpu, &vq->irq_affinity); + if (cpu_online(curr_cpu)) + break; + + if (curr_cpu >= nr_cpu_ids) + curr_cpu = IRQ_UNBOUND; + } + + vq->irq_effective_cpu = curr_cpu; +} + static long vduse_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -1044,7 +1117,6 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, struct vduse_iotlb_entry entry; struct vhost_iotlb_map *map; struct vdpa_map_file *map_file; - struct vduse_iova_domain *domain = dev->domain; struct file *f = NULL; ret = -EFAULT; @@ -1055,8 +1127,13 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, if (entry.start > entry.last) break; - spin_lock(&domain->iotlb_lock); - map = vhost_iotlb_itree_first(domain->iotlb, + mutex_lock(&dev->domain_lock); + if (!dev->domain) { + mutex_unlock(&dev->domain_lock); + break; + } + spin_lock(&dev->domain->iotlb_lock); + map = vhost_iotlb_itree_first(dev->domain->iotlb, entry.start, entry.last); if (map) { map_file = (struct vdpa_map_file *)map->opaque; @@ -1066,7 +1143,8 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, entry.last = map->last; entry.perm = map->perm; } - spin_unlock(&domain->iotlb_lock); + spin_unlock(&dev->domain->iotlb_lock); + mutex_unlock(&dev->domain_lock); ret = -EINVAL; if (!f) break; @@ -1111,7 +1189,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, break; } case VDUSE_DEV_INJECT_CONFIG_IRQ: - ret = vduse_dev_queue_irq_work(dev, &dev->inject); + ret = vduse_dev_queue_irq_work(dev, &dev->inject, IRQ_UNBOUND); break; case VDUSE_VQ_SETUP: { struct vduse_vq_config config; @@ -1130,7 +1208,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, break; index = array_index_nospec(config.index, dev->vq_num); - dev->vqs[index].num_max = config.max_size; + dev->vqs[index]->num_max = config.max_size; ret = 0; break; } @@ -1148,7 +1226,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, break; index = array_index_nospec(vq_info.index, dev->vq_num); - vq = &dev->vqs[index]; + vq = dev->vqs[index]; vq_info.desc_addr = vq->desc_addr; vq_info.driver_addr = vq->driver_addr; vq_info.device_addr = vq->device_addr; @@ -1197,8 +1275,14 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, if (index >= dev->vq_num) break; + ret = 0; index = array_index_nospec(index, dev->vq_num); - ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index].inject); + if (!vduse_vq_signal_irqfd(dev->vqs[index])) { + vduse_vq_update_effective_cpu(dev->vqs[index]); + ret = 
vduse_dev_queue_irq_work(dev, + &dev->vqs[index]->inject, + dev->vqs[index]->irq_effective_cpu); + } break; } case VDUSE_IOTLB_REG_UMEM: { @@ -1213,8 +1297,10 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, sizeof(umem.reserved))) break; + mutex_lock(&dev->domain_lock); ret = vduse_dev_reg_umem(dev, umem.iova, umem.uaddr, umem.size); + mutex_unlock(&dev->domain_lock); break; } case VDUSE_IOTLB_DEREG_UMEM: { @@ -1228,15 +1314,15 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, if (!is_mem_zero((const char *)umem.reserved, sizeof(umem.reserved))) break; - + mutex_lock(&dev->domain_lock); ret = vduse_dev_dereg_umem(dev, umem.iova, umem.size); + mutex_unlock(&dev->domain_lock); break; } case VDUSE_IOTLB_GET_INFO: { struct vduse_iova_info info; struct vhost_iotlb_map *map; - struct vduse_iova_domain *domain = dev->domain; ret = -EFAULT; if (copy_from_user(&info, argp, sizeof(info))) @@ -1250,18 +1336,24 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, sizeof(info.reserved))) break; - spin_lock(&domain->iotlb_lock); - map = vhost_iotlb_itree_first(domain->iotlb, + mutex_lock(&dev->domain_lock); + if (!dev->domain) { + mutex_unlock(&dev->domain_lock); + break; + } + spin_lock(&dev->domain->iotlb_lock); + map = vhost_iotlb_itree_first(dev->domain->iotlb, info.start, info.last); if (map) { info.start = map->start; info.last = map->last; info.capability = 0; - if (domain->bounce_map && map->start == 0 && - map->last == domain->bounce_size - 1) + if (dev->domain->bounce_map && map->start == 0 && + map->last == dev->domain->bounce_size - 1) info.capability |= VDUSE_IOVA_CAP_UMEM; } - spin_unlock(&domain->iotlb_lock); + spin_unlock(&dev->domain->iotlb_lock); + mutex_unlock(&dev->domain_lock); if (!map) break; @@ -1284,7 +1376,10 @@ static int vduse_dev_release(struct inode *inode, struct file *file) { struct vduse_dev *dev = file->private_data; - vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size); + mutex_lock(&dev->domain_lock); + if (dev->domain) + vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size); + mutex_unlock(&dev->domain_lock); spin_lock(&dev->msg_lock); /* Make sure the inflight messages can processed after reconncection */ list_splice_init(&dev->recv_list, &dev->send_list); @@ -1339,6 +1434,151 @@ static const struct file_operations vduse_dev_fops = { .llseek = noop_llseek, }; +static ssize_t irq_cb_affinity_show(struct vduse_virtqueue *vq, char *buf) +{ + return sprintf(buf, "%*pb\n", cpumask_pr_args(&vq->irq_affinity)); +} + +static ssize_t irq_cb_affinity_store(struct vduse_virtqueue *vq, + const char *buf, size_t count) +{ + cpumask_var_t new_value; + int ret; + + if (!zalloc_cpumask_var(&new_value, GFP_KERNEL)) + return -ENOMEM; + + ret = cpumask_parse(buf, new_value); + if (ret) + goto free_mask; + + ret = -EINVAL; + if (!cpumask_intersects(new_value, cpu_online_mask)) + goto free_mask; + + cpumask_copy(&vq->irq_affinity, new_value); + ret = count; +free_mask: + free_cpumask_var(new_value); + return ret; +} + +struct vq_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct vduse_virtqueue *vq, char *buf); + ssize_t (*store)(struct vduse_virtqueue *vq, const char *buf, + size_t count); +}; + +static struct vq_sysfs_entry irq_cb_affinity_attr = __ATTR_RW(irq_cb_affinity); + +static struct attribute *vq_attrs[] = { + &irq_cb_affinity_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(vq); + +static ssize_t vq_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct vduse_virtqueue *vq = 
container_of(kobj, + struct vduse_virtqueue, kobj); + struct vq_sysfs_entry *entry = container_of(attr, + struct vq_sysfs_entry, attr); + + if (!entry->show) + return -EIO; + + return entry->show(vq, buf); +} + +static ssize_t vq_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct vduse_virtqueue *vq = container_of(kobj, + struct vduse_virtqueue, kobj); + struct vq_sysfs_entry *entry = container_of(attr, + struct vq_sysfs_entry, attr); + + if (!entry->store) + return -EIO; + + return entry->store(vq, buf, count); +} + +static const struct sysfs_ops vq_sysfs_ops = { + .show = vq_attr_show, + .store = vq_attr_store, +}; + +static void vq_release(struct kobject *kobj) +{ + struct vduse_virtqueue *vq = container_of(kobj, + struct vduse_virtqueue, kobj); + kfree(vq); +} + +static const struct kobj_type vq_type = { + .release = vq_release, + .sysfs_ops = &vq_sysfs_ops, + .default_groups = vq_groups, +}; + +static void vduse_dev_deinit_vqs(struct vduse_dev *dev) +{ + int i; + + if (!dev->vqs) + return; + + for (i = 0; i < dev->vq_num; i++) + kobject_put(&dev->vqs[i]->kobj); + kfree(dev->vqs); +} + +static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num) +{ + int ret, i; + + dev->vq_align = vq_align; + dev->vq_num = vq_num; + dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL); + if (!dev->vqs) + return -ENOMEM; + + for (i = 0; i < vq_num; i++) { + dev->vqs[i] = kzalloc(sizeof(*dev->vqs[i]), GFP_KERNEL); + if (!dev->vqs[i]) { + ret = -ENOMEM; + goto err; + } + + dev->vqs[i]->index = i; + dev->vqs[i]->irq_effective_cpu = IRQ_UNBOUND; + INIT_WORK(&dev->vqs[i]->inject, vduse_vq_irq_inject); + INIT_WORK(&dev->vqs[i]->kick, vduse_vq_kick_work); + spin_lock_init(&dev->vqs[i]->kick_lock); + spin_lock_init(&dev->vqs[i]->irq_lock); + cpumask_setall(&dev->vqs[i]->irq_affinity); + + kobject_init(&dev->vqs[i]->kobj, &vq_type); + ret = kobject_add(&dev->vqs[i]->kobj, + &dev->dev->kobj, "vq%d", i); + if (ret) { + kfree(dev->vqs[i]); + goto err; + } + } + + return 0; +err: + while (i--) + kobject_put(&dev->vqs[i]->kobj); + kfree(dev->vqs); + dev->vqs = NULL; + return ret; +} + static struct vduse_dev *vduse_dev_create(void) { struct vduse_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL); @@ -1348,6 +1588,7 @@ static struct vduse_dev *vduse_dev_create(void) mutex_init(&dev->lock); mutex_init(&dev->mem_lock); + mutex_init(&dev->domain_lock); spin_lock_init(&dev->msg_lock); INIT_LIST_HEAD(&dev->send_list); INIT_LIST_HEAD(&dev->recv_list); @@ -1396,8 +1637,9 @@ static int vduse_destroy_dev(char *name) device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor)); idr_remove(&vduse_idr, dev->minor); kvfree(dev->config); - kfree(dev->vqs); - vduse_domain_destroy(dev->domain); + vduse_dev_deinit_vqs(dev); + if (dev->domain) + vduse_domain_destroy(dev->domain); kfree(dev->name); vduse_dev_destroy(dev); module_put(THIS_MODULE); @@ -1476,8 +1718,48 @@ static ssize_t msg_timeout_store(struct device *device, static DEVICE_ATTR_RW(msg_timeout); +static ssize_t bounce_size_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct vduse_dev *dev = dev_get_drvdata(device); + + return sysfs_emit(buf, "%u\n", dev->bounce_size); +} + +static ssize_t bounce_size_store(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct vduse_dev *dev = dev_get_drvdata(device); + unsigned int bounce_size; + int ret; + + ret = -EPERM; + mutex_lock(&dev->domain_lock); + if (dev->domain) + goto 
unlock; + + ret = kstrtouint(buf, 10, &bounce_size); + if (ret < 0) + goto unlock; + + ret = -EINVAL; + if (bounce_size > VDUSE_MAX_BOUNCE_SIZE || + bounce_size < VDUSE_MIN_BOUNCE_SIZE) + goto unlock; + + dev->bounce_size = bounce_size & PAGE_MASK; + ret = count; +unlock: + mutex_unlock(&dev->domain_lock); + return ret; +} + +static DEVICE_ATTR_RW(bounce_size); + static struct attribute *vduse_dev_attrs[] = { &dev_attr_msg_timeout.attr, + &dev_attr_bounce_size.attr, NULL }; @@ -1486,7 +1768,7 @@ ATTRIBUTE_GROUPS(vduse_dev); static int vduse_create_dev(struct vduse_dev_config *config, void *config_buf, u64 api_version) { - int i, ret; + int ret; struct vduse_dev *dev; ret = -EEXIST; @@ -1506,26 +1788,9 @@ static int vduse_create_dev(struct vduse_dev_config *config, if (!dev->name) goto err_str; - dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1, - VDUSE_BOUNCE_SIZE); - if (!dev->domain) - goto err_domain; - + dev->bounce_size = VDUSE_BOUNCE_SIZE; dev->config = config_buf; dev->config_size = config->config_size; - dev->vq_align = config->vq_align; - dev->vq_num = config->vq_num; - dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL); - if (!dev->vqs) - goto err_vqs; - - for (i = 0; i < dev->vq_num; i++) { - dev->vqs[i].index = i; - INIT_WORK(&dev->vqs[i].inject, vduse_vq_irq_inject); - INIT_WORK(&dev->vqs[i].kick, vduse_vq_kick_work); - spin_lock_init(&dev->vqs[i].kick_lock); - spin_lock_init(&dev->vqs[i].irq_lock); - } ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL); if (ret < 0) @@ -1540,16 +1805,19 @@ static int vduse_create_dev(struct vduse_dev_config *config, ret = PTR_ERR(dev->dev); goto err_dev; } + + ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num); + if (ret) + goto err_vqs; + __module_get(THIS_MODULE); return 0; +err_vqs: + device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor)); err_dev: idr_remove(&vduse_idr, dev->minor); err_idr: - kfree(dev->vqs); -err_vqs: - vduse_domain_destroy(dev->domain); -err_domain: kfree(dev->name); err_str: vduse_dev_destroy(dev); @@ -1716,9 +1984,23 @@ static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, if (ret) return ret; + mutex_lock(&dev->domain_lock); + if (!dev->domain) + dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1, + dev->bounce_size); + mutex_unlock(&dev->domain_lock); + if (!dev->domain) { + put_device(&dev->vdev->vdpa.dev); + return -ENOMEM; + } + ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num); if (ret) { put_device(&dev->vdev->vdpa.dev); + mutex_lock(&dev->domain_lock); + vduse_domain_destroy(dev->domain); + dev->domain = NULL; + mutex_unlock(&dev->domain_lock); return ret; } @@ -1824,12 +2106,15 @@ static int vduse_init(void) if (ret) goto err_cdev; + ret = -ENOMEM; vduse_irq_wq = alloc_workqueue("vduse-irq", WQ_HIGHPRI | WQ_SYSFS | WQ_UNBOUND, 0); - if (!vduse_irq_wq) { - ret = -ENOMEM; + if (!vduse_irq_wq) goto err_wq; - } + + vduse_irq_bound_wq = alloc_workqueue("vduse-irq-bound", WQ_HIGHPRI, 0); + if (!vduse_irq_bound_wq) + goto err_bound_wq; ret = vduse_domain_init(); if (ret) @@ -1843,6 +2128,8 @@ static int vduse_init(void) err_mgmtdev: vduse_domain_exit(); err_domain: + destroy_workqueue(vduse_irq_bound_wq); +err_bound_wq: destroy_workqueue(vduse_irq_wq); err_wq: cdev_del(&vduse_cdev); @@ -1862,6 +2149,7 @@ static void vduse_exit(void) { vduse_mgmtdev_exit(); vduse_domain_exit(); + destroy_workqueue(vduse_irq_bound_wq); destroy_workqueue(vduse_irq_wq); cdev_del(&vduse_cdev); device_destroy(vduse_class, vduse_major); diff --git 
a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 139ad52d0aa1..bb10fa4bb4f6 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -229,7 +229,10 @@ struct vhost_scsi_ctx { struct iov_iter out_iter; }; -/* Global spinlock to protect vhost_scsi TPG list for vhost IOCTL access */ +/* + * Global mutex to protect vhost_scsi TPG list for vhost IOCTLs and LIO + * configfs management operations. + */ static DEFINE_MUTEX(vhost_scsi_mutex); static LIST_HEAD(vhost_scsi_list); @@ -1501,7 +1504,7 @@ out: * vhost_scsi_tpg with an active struct vhost_scsi_nexus * * The lock nesting rule is: - * vhost_scsi_mutex -> vs->dev.mutex -> tpg->tv_tpg_mutex -> vq->mutex + * vs->dev.mutex -> vhost_scsi_mutex -> tpg->tv_tpg_mutex -> vq->mutex */ static int vhost_scsi_set_endpoint(struct vhost_scsi *vs, @@ -1515,7 +1518,6 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs, int index, ret, i, len; bool match = false; - mutex_lock(&vhost_scsi_mutex); mutex_lock(&vs->dev.mutex); /* Verify that ring has been setup correctly. */ @@ -1536,6 +1538,7 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs, if (vs->vs_tpg) memcpy(vs_tpg, vs->vs_tpg, len); + mutex_lock(&vhost_scsi_mutex); list_for_each_entry(tpg, &vhost_scsi_list, tv_tpg_list) { mutex_lock(&tpg->tv_tpg_mutex); if (!tpg->tpg_nexus) { @@ -1551,6 +1554,7 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs, if (!strcmp(tv_tport->tport_name, t->vhost_wwpn)) { if (vs->vs_tpg && vs->vs_tpg[tpg->tport_tpgt]) { mutex_unlock(&tpg->tv_tpg_mutex); + mutex_unlock(&vhost_scsi_mutex); ret = -EEXIST; goto undepend; } @@ -1565,6 +1569,7 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs, if (ret) { pr_warn("target_depend_item() failed: %d\n", ret); mutex_unlock(&tpg->tv_tpg_mutex); + mutex_unlock(&vhost_scsi_mutex); goto undepend; } tpg->tv_tpg_vhost_count++; @@ -1574,6 +1579,7 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs, } mutex_unlock(&tpg->tv_tpg_mutex); } + mutex_unlock(&vhost_scsi_mutex); if (match) { memcpy(vs->vs_vhost_wwpn, t->vhost_wwpn, @@ -1629,7 +1635,6 @@ undepend: kfree(vs_tpg); out: mutex_unlock(&vs->dev.mutex); - mutex_unlock(&vhost_scsi_mutex); return ret; } @@ -1645,7 +1650,6 @@ vhost_scsi_clear_endpoint(struct vhost_scsi *vs, int index, ret, i; u8 target; - mutex_lock(&vhost_scsi_mutex); mutex_lock(&vs->dev.mutex); /* Verify that ring has been setup correctly. */ for (index = 0; index < vs->dev.nvqs; ++index) { @@ -1666,11 +1670,10 @@ vhost_scsi_clear_endpoint(struct vhost_scsi *vs, if (!tpg) continue; - mutex_lock(&tpg->tv_tpg_mutex); tv_tport = tpg->tport; if (!tv_tport) { ret = -ENODEV; - goto err_tpg; + goto err_dev; } if (strcmp(tv_tport->tport_name, t->vhost_wwpn)) { @@ -1679,35 +1682,51 @@ vhost_scsi_clear_endpoint(struct vhost_scsi *vs, tv_tport->tport_name, tpg->tport_tpgt, t->vhost_wwpn, t->vhost_tpgt); ret = -EINVAL; - goto err_tpg; + goto err_dev; } + match = true; + } + if (!match) + goto free_vs_tpg; + + /* Prevent new cmds from starting and accessing the tpgs/sessions */ + for (i = 0; i < vs->dev.nvqs; i++) { + vq = &vs->vqs[i].vq; + mutex_lock(&vq->mutex); + vhost_vq_set_backend(vq, NULL); + mutex_unlock(&vq->mutex); + } + /* Make sure cmds are not running before tearing them down. */ + vhost_scsi_flush(vs); + + for (i = 0; i < vs->dev.nvqs; i++) { + vq = &vs->vqs[i].vq; + vhost_scsi_destroy_vq_cmds(vq); + } + + /* + * We can now release our hold on the tpg and sessions and userspace + * can free them after this point. 
+ */ + for (i = 0; i < VHOST_SCSI_MAX_TARGET; i++) { + target = i; + tpg = vs->vs_tpg[target]; + if (!tpg) + continue; + + mutex_lock(&tpg->tv_tpg_mutex); + tpg->tv_tpg_vhost_count--; tpg->vhost_scsi = NULL; vs->vs_tpg[target] = NULL; - match = true; + mutex_unlock(&tpg->tv_tpg_mutex); - /* - * Release se_tpg->tpg_group.cg_item configfs dependency now - * to allow vhost-scsi WWPN se_tpg->tpg_group shutdown to occur. - */ + se_tpg = &tpg->se_tpg; target_undepend_item(&se_tpg->tpg_group.cg_item); } - if (match) { - for (i = 0; i < vs->dev.nvqs; i++) { - vq = &vs->vqs[i].vq; - mutex_lock(&vq->mutex); - vhost_vq_set_backend(vq, NULL); - mutex_unlock(&vq->mutex); - } - /* Make sure cmds are not running before tearing them down. */ - vhost_scsi_flush(vs); - for (i = 0; i < vs->dev.nvqs; i++) { - vq = &vs->vqs[i].vq; - vhost_scsi_destroy_vq_cmds(vq); - } - } +free_vs_tpg: /* * Act as synchronize_rcu to make sure access to * old vs->vs_tpg is finished. @@ -1717,14 +1736,10 @@ vhost_scsi_clear_endpoint(struct vhost_scsi *vs, vs->vs_tpg = NULL; WARN_ON(vs->vs_events_nr); mutex_unlock(&vs->dev.mutex); - mutex_unlock(&vhost_scsi_mutex); return 0; -err_tpg: - mutex_unlock(&tpg->tv_tpg_mutex); err_dev: mutex_unlock(&vs->dev.mutex); - mutex_unlock(&vhost_scsi_mutex); return ret; } @@ -1965,8 +1980,6 @@ vhost_scsi_do_plug(struct vhost_scsi_tpg *tpg, if (!vs) return; - mutex_lock(&vs->dev.mutex); - if (plug) reason = VIRTIO_SCSI_EVT_RESET_RESCAN; else @@ -1974,11 +1987,18 @@ vhost_scsi_do_plug(struct vhost_scsi_tpg *tpg, vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq; mutex_lock(&vq->mutex); + /* + * We can't queue events if the backend has been cleared, because + * we could end up queueing an event after the flush. + */ + if (!vhost_vq_get_backend(vq)) + goto unlock; + if (vhost_has_feature(vq, VIRTIO_SCSI_F_HOTPLUG)) vhost_scsi_send_evt(vs, tpg, lun, VIRTIO_SCSI_T_TRANSPORT_RESET, reason); +unlock: mutex_unlock(&vq->mutex); - mutex_unlock(&vs->dev.mutex); } static void vhost_scsi_hotplug(struct vhost_scsi_tpg *tpg, struct se_lun *lun) @@ -1997,15 +2017,10 @@ static int vhost_scsi_port_link(struct se_portal_group *se_tpg, struct vhost_scsi_tpg *tpg = container_of(se_tpg, struct vhost_scsi_tpg, se_tpg); - mutex_lock(&vhost_scsi_mutex); - mutex_lock(&tpg->tv_tpg_mutex); tpg->tv_tpg_port_count++; - mutex_unlock(&tpg->tv_tpg_mutex); - vhost_scsi_hotplug(tpg, lun); - - mutex_unlock(&vhost_scsi_mutex); + mutex_unlock(&tpg->tv_tpg_mutex); return 0; } @@ -2016,15 +2031,10 @@ static void vhost_scsi_port_unlink(struct se_portal_group *se_tpg, struct vhost_scsi_tpg *tpg = container_of(se_tpg, struct vhost_scsi_tpg, se_tpg); - mutex_lock(&vhost_scsi_mutex); - mutex_lock(&tpg->tv_tpg_mutex); tpg->tv_tpg_port_count--; - mutex_unlock(&tpg->tv_tpg_mutex); - vhost_scsi_hotunplug(tpg, lun); - - mutex_unlock(&vhost_scsi_mutex); + mutex_unlock(&tpg->tv_tpg_mutex); } static ssize_t vhost_scsi_tpg_attrib_fabric_prot_type_store( diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 135ad39958cc..8c1aefc865f0 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -219,6 +219,28 @@ static int vhost_vdpa_reset(struct vhost_vdpa *v) return vdpa_reset(vdpa); } +static long vhost_vdpa_bind_mm(struct vhost_vdpa *v) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + + if (!vdpa->use_va || !ops->bind_mm) + return 0; + + return ops->bind_mm(vdpa, v->vdev.mm); +} + +static void vhost_vdpa_unbind_mm(struct vhost_vdpa *v) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct 
vdpa_config_ops *ops = vdpa->config; + + if (!vdpa->use_va || !ops->unbind_mm) + return; + + ops->unbind_mm(vdpa); +} + static long vhost_vdpa_get_device_id(struct vhost_vdpa *v, u8 __user *argp) { struct vdpa_device *vdpa = v->vdpa; @@ -599,9 +621,11 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, if (vq->call_ctx.ctx) { cb.callback = vhost_vdpa_virtqueue_cb; cb.private = vq; + cb.trigger = vq->call_ctx.ctx; } else { cb.callback = NULL; cb.private = NULL; + cb.trigger = NULL; } ops->set_vq_cb(vdpa, idx, &cb); vhost_vdpa_setup_vq_irq(v, idx); @@ -716,6 +740,17 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, break; } + if (r) + goto out; + + switch (cmd) { + case VHOST_SET_OWNER: + r = vhost_vdpa_bind_mm(v); + if (r) + vhost_dev_reset_owner(d, NULL); + break; + } +out: mutex_unlock(&d->mutex); return r; } @@ -851,11 +886,7 @@ static void vhost_vdpa_unmap(struct vhost_vdpa *v, if (!v->in_batch) ops->set_map(vdpa, asid, iotlb); } - /* If we are in the middle of batch processing, delay the free - * of AS until BATCH_END. - */ - if (!v->in_batch && !iotlb->nmaps) - vhost_vdpa_remove_as(v, asid); + } static int vhost_vdpa_va_map(struct vhost_vdpa *v, @@ -1112,8 +1143,6 @@ static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, u32 asid, if (v->in_batch && ops->set_map) ops->set_map(vdpa, asid, iotlb); v->in_batch = false; - if (!iotlb->nmaps) - vhost_vdpa_remove_as(v, asid); break; default: r = -EINVAL; @@ -1287,6 +1316,7 @@ static int vhost_vdpa_release(struct inode *inode, struct file *filep) vhost_vdpa_clean_irq(v); vhost_vdpa_reset(v); vhost_dev_stop(&v->vdev); + vhost_vdpa_unbind_mm(v); vhost_vdpa_config_put(v); vhost_vdpa_cleanup(v); mutex_unlock(&d->mutex); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 6d07b42833be..10bf35a3db6e 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -434,8 +434,7 @@ static size_t vhost_get_avail_size(struct vhost_virtqueue *vq, size_t event __maybe_unused = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; - return sizeof(*vq->avail) + - sizeof(*vq->avail->ring) * num + event; + return size_add(struct_size(vq->avail, ring, num), event); } static size_t vhost_get_used_size(struct vhost_virtqueue *vq, @@ -444,8 +443,7 @@ static size_t vhost_get_used_size(struct vhost_virtqueue *vq, size_t event __maybe_unused = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; - return sizeof(*vq->used) + - sizeof(*vq->used->ring) * num + event; + return size_add(struct_size(vq->used, ring, num), event); } static size_t vhost_get_desc_size(struct vhost_virtqueue *vq, diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index a1e27da54481..955d938eb663 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -636,9 +636,9 @@ static inline int xfer_to_user(const struct vringh *vrh, * @features: the feature bits for this ring. * @num: the number of elements. * @weak_barriers: true if we only need memory barriers, not I/O. - * @desc: the userpace descriptor pointer. - * @avail: the userpace avail pointer. - * @used: the userpace used pointer. + * @desc: the userspace descriptor pointer. + * @avail: the userspace avail pointer. + * @used: the userspace used pointer. * * Returns an error if num is invalid: you should check pointers * yourself! @@ -911,9 +911,9 @@ static inline int kern_xfer(const struct vringh *vrh, void *dst, * @features: the feature bits for this ring. * @num: the number of elements. * @weak_barriers: true if we only need memory barriers, not I/O. 
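The vhost_get_avail_size()/vhost_get_used_size() change above replaces the open-coded sizeof-plus-multiply arithmetic with struct_size() and size_add() so the computation saturates at SIZE_MAX instead of wrapping on overflow. A self-contained illustration of the same pattern (demo_ring and demo_ring_bytes are made-up names, not from the patch):

#include <linux/overflow.h>
#include <linux/types.h>

struct demo_ring {
	u16 flags;
	u16 idx;
	u16 ring[];		/* flexible array, like vring_avail.ring */
};

/* sizeof(*r) + num * sizeof(r->ring[0]) + extra, saturating on overflow. */
static size_t demo_ring_bytes(const struct demo_ring *r, size_t num,
			      size_t extra)
{
	return size_add(struct_size(r, ring, num), extra);
}
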
- * @desc: the userpace descriptor pointer. - * @avail: the userpace avail pointer. - * @used: the userpace used pointer. + * @desc: the userspace descriptor pointer. + * @avail: the userspace avail pointer. + * @used: the userspace used pointer. * * Returns an error if num is invalid. */ @@ -1094,10 +1094,17 @@ EXPORT_SYMBOL(vringh_need_notify_kern); #if IS_REACHABLE(CONFIG_VHOST_IOTLB) +struct iotlb_vec { + union { + struct iovec *iovec; + struct bio_vec *bvec; + } iov; + size_t count; +}; + static int iotlb_translate(const struct vringh *vrh, u64 addr, u64 len, u64 *translated, - struct bio_vec iov[], - int iov_size, u32 perm) + struct iotlb_vec *ivec, u32 perm) { struct vhost_iotlb_map *map; struct vhost_iotlb *iotlb = vrh->iotlb; @@ -1107,9 +1114,11 @@ static int iotlb_translate(const struct vringh *vrh, spin_lock(vrh->iotlb_lock); while (len > s) { - u64 size, pa, pfn; + uintptr_t io_addr; + size_t io_len; + u64 size; - if (unlikely(ret >= iov_size)) { + if (unlikely(ret >= ivec->count)) { ret = -ENOBUFS; break; } @@ -1124,10 +1133,22 @@ static int iotlb_translate(const struct vringh *vrh, } size = map->size - addr + map->start; - pa = map->addr + addr - map->start; - pfn = pa >> PAGE_SHIFT; - bvec_set_page(&iov[ret], pfn_to_page(pfn), min(len - s, size), - pa & (PAGE_SIZE - 1)); + io_len = min(len - s, size); + io_addr = map->addr - map->start + addr; + + if (vrh->use_va) { + struct iovec *iovec = ivec->iov.iovec; + + iovec[ret].iov_len = io_len; + iovec[ret].iov_base = (void __user *)io_addr; + } else { + u64 pfn = io_addr >> PAGE_SHIFT; + struct bio_vec *bvec = ivec->iov.bvec; + + bvec_set_page(&bvec[ret], pfn_to_page(pfn), io_len, + io_addr & (PAGE_SIZE - 1)); + } + s += size; addr += size; ++ret; @@ -1141,26 +1162,41 @@ static int iotlb_translate(const struct vringh *vrh, return ret; } +#define IOTLB_IOV_STRIDE 16 + static inline int copy_from_iotlb(const struct vringh *vrh, void *dst, void *src, size_t len) { + struct iotlb_vec ivec; + union { + struct iovec iovec[IOTLB_IOV_STRIDE]; + struct bio_vec bvec[IOTLB_IOV_STRIDE]; + } iov; u64 total_translated = 0; + ivec.iov.iovec = iov.iovec; + ivec.count = IOTLB_IOV_STRIDE; + while (total_translated < len) { - struct bio_vec iov[16]; struct iov_iter iter; u64 translated; int ret; ret = iotlb_translate(vrh, (u64)(uintptr_t)src, len - total_translated, &translated, - iov, ARRAY_SIZE(iov), VHOST_MAP_RO); + &ivec, VHOST_MAP_RO); if (ret == -ENOBUFS) - ret = ARRAY_SIZE(iov); + ret = IOTLB_IOV_STRIDE; else if (ret < 0) return ret; - iov_iter_bvec(&iter, ITER_SOURCE, iov, ret, translated); + if (vrh->use_va) { + iov_iter_init(&iter, ITER_SOURCE, ivec.iov.iovec, ret, + translated); + } else { + iov_iter_bvec(&iter, ITER_SOURCE, ivec.iov.bvec, ret, + translated); + } ret = copy_from_iter(dst, translated, &iter); if (ret < 0) @@ -1177,23 +1213,36 @@ static inline int copy_from_iotlb(const struct vringh *vrh, void *dst, static inline int copy_to_iotlb(const struct vringh *vrh, void *dst, void *src, size_t len) { + struct iotlb_vec ivec; + union { + struct iovec iovec[IOTLB_IOV_STRIDE]; + struct bio_vec bvec[IOTLB_IOV_STRIDE]; + } iov; u64 total_translated = 0; + ivec.iov.iovec = iov.iovec; + ivec.count = IOTLB_IOV_STRIDE; + while (total_translated < len) { - struct bio_vec iov[16]; struct iov_iter iter; u64 translated; int ret; ret = iotlb_translate(vrh, (u64)(uintptr_t)dst, len - total_translated, &translated, - iov, ARRAY_SIZE(iov), VHOST_MAP_WO); + &ivec, VHOST_MAP_WO); if (ret == -ENOBUFS) - ret = ARRAY_SIZE(iov); + ret = IOTLB_IOV_STRIDE; 
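The copy_from_iotlb()/copy_to_iotlb() rework above funnels both the kernel-PA case (bio_vec) and the new user-VA case (iovec) through one translation buffer and then builds the matching iov_iter. A condensed sketch of that pattern, using illustrative names (xlat_vec, xlat_iter_init) rather than the patch's iotlb_vec:

#include <linux/bvec.h>
#include <linux/uio.h>

#define XLAT_STRIDE 16

struct xlat_vec {
	union {
		struct iovec   iovec[XLAT_STRIDE];
		struct bio_vec bvec[XLAT_STRIDE];
	} iov;
	size_t count;
};

/* Build an iov_iter over whichever view the translation step filled in. */
static void xlat_iter_init(struct iov_iter *iter, struct xlat_vec *v,
			   unsigned int nr_filled, size_t len,
			   bool use_va, bool writing)
{
	unsigned int dir = writing ? ITER_DEST : ITER_SOURCE;

	if (use_va)
		iov_iter_init(iter, dir, v->iov.iovec, nr_filled, len);
	else
		iov_iter_bvec(iter, dir, v->iov.bvec, nr_filled, len);
}
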
else if (ret < 0) return ret; - iov_iter_bvec(&iter, ITER_DEST, iov, ret, translated); + if (vrh->use_va) { + iov_iter_init(&iter, ITER_DEST, ivec.iov.iovec, ret, + translated); + } else { + iov_iter_bvec(&iter, ITER_DEST, ivec.iov.bvec, ret, + translated); + } ret = copy_to_iter(src, translated, &iter); if (ret < 0) @@ -1210,20 +1259,36 @@ static inline int copy_to_iotlb(const struct vringh *vrh, void *dst, static inline int getu16_iotlb(const struct vringh *vrh, u16 *val, const __virtio16 *p) { - struct bio_vec iov; - void *kaddr, *from; + struct iotlb_vec ivec; + union { + struct iovec iovec[1]; + struct bio_vec bvec[1]; + } iov; + __virtio16 tmp; int ret; + ivec.iov.iovec = iov.iovec; + ivec.count = 1; + /* Atomic read is needed for getu16 */ - ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), NULL, - &iov, 1, VHOST_MAP_RO); + ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), + NULL, &ivec, VHOST_MAP_RO); if (ret < 0) return ret; - kaddr = kmap_atomic(iov.bv_page); - from = kaddr + iov.bv_offset; - *val = vringh16_to_cpu(vrh, READ_ONCE(*(__virtio16 *)from)); - kunmap_atomic(kaddr); + if (vrh->use_va) { + ret = __get_user(tmp, (__virtio16 __user *)ivec.iov.iovec[0].iov_base); + if (ret) + return ret; + } else { + void *kaddr = kmap_local_page(ivec.iov.bvec[0].bv_page); + void *from = kaddr + ivec.iov.bvec[0].bv_offset; + + tmp = READ_ONCE(*(__virtio16 *)from); + kunmap_local(kaddr); + } + + *val = vringh16_to_cpu(vrh, tmp); return 0; } @@ -1231,20 +1296,36 @@ static inline int getu16_iotlb(const struct vringh *vrh, static inline int putu16_iotlb(const struct vringh *vrh, __virtio16 *p, u16 val) { - struct bio_vec iov; - void *kaddr, *to; + struct iotlb_vec ivec; + union { + struct iovec iovec; + struct bio_vec bvec; + } iov; + __virtio16 tmp; int ret; + ivec.iov.iovec = &iov.iovec; + ivec.count = 1; + /* Atomic write is needed for putu16 */ - ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), NULL, - &iov, 1, VHOST_MAP_WO); + ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), + NULL, &ivec, VHOST_MAP_RO); if (ret < 0) return ret; - kaddr = kmap_atomic(iov.bv_page); - to = kaddr + iov.bv_offset; - WRITE_ONCE(*(__virtio16 *)to, cpu_to_vringh16(vrh, val)); - kunmap_atomic(kaddr); + tmp = cpu_to_vringh16(vrh, val); + + if (vrh->use_va) { + ret = __put_user(tmp, (__virtio16 __user *)ivec.iov.iovec[0].iov_base); + if (ret) + return ret; + } else { + void *kaddr = kmap_local_page(ivec.iov.bvec[0].bv_page); + void *to = kaddr + ivec.iov.bvec[0].bv_offset; + + WRITE_ONCE(*(__virtio16 *)to, tmp); + kunmap_local(kaddr); + } return 0; } @@ -1306,9 +1387,9 @@ static inline int putused_iotlb(const struct vringh *vrh, * @features: the feature bits for this ring. * @num: the number of elements. * @weak_barriers: true if we only need memory barriers, not I/O. - * @desc: the userpace descriptor pointer. - * @avail: the userpace avail pointer. - * @used: the userpace used pointer. + * @desc: the userspace descriptor pointer. + * @avail: the userspace avail pointer. + * @used: the userspace used pointer. * * Returns an error if num is invalid. */ @@ -1318,12 +1399,40 @@ int vringh_init_iotlb(struct vringh *vrh, u64 features, struct vring_avail *avail, struct vring_used *used) { + vrh->use_va = false; + return vringh_init_kern(vrh, features, num, weak_barriers, desc, avail, used); } EXPORT_SYMBOL(vringh_init_iotlb); /** + * vringh_init_iotlb_va - initialize a vringh for a ring with IOTLB containing + * user VA. + * @vrh: the vringh to initialize. 
+ * @features: the feature bits for this ring. + * @num: the number of elements. + * @weak_barriers: true if we only need memory barriers, not I/O. + * @desc: the userspace descriptor pointer. + * @avail: the userspace avail pointer. + * @used: the userspace used pointer. + * + * Returns an error if num is invalid. + */ +int vringh_init_iotlb_va(struct vringh *vrh, u64 features, + unsigned int num, bool weak_barriers, + struct vring_desc *desc, + struct vring_avail *avail, + struct vring_used *used) +{ + vrh->use_va = true; + + return vringh_init_kern(vrh, features, num, weak_barriers, + desc, avail, used); +} +EXPORT_SYMBOL(vringh_init_iotlb_va); + +/** * vringh_set_iotlb - initialize a vringh for a ring with IOTLB. * @vrh: the vring * @iotlb: iotlb associated with this vring diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index 4b5e883055ee..a46a4a29e929 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -286,6 +286,16 @@ static bool vm_notify(struct virtqueue *vq) return true; } +static bool vm_notify_with_data(struct virtqueue *vq) +{ + struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); + u32 data = vring_notification_data(vq); + + writel(data, vm_dev->base + VIRTIO_MMIO_QUEUE_NOTIFY); + + return true; +} + /* Notify all virtqueues on an interrupt. */ static irqreturn_t vm_interrupt(int irq, void *opaque) { @@ -364,12 +374,18 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in const char *name, bool ctx) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); + bool (*notify)(struct virtqueue *vq); struct virtio_mmio_vq_info *info; struct virtqueue *vq; unsigned long flags; unsigned int num; int err; + if (__virtio_test_bit(vdev, VIRTIO_F_NOTIFICATION_DATA)) + notify = vm_notify_with_data; + else + notify = vm_notify; + if (!name) return NULL; @@ -398,7 +414,7 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in /* Create the vring */ vq = vring_create_virtqueue(index, num, VIRTIO_MMIO_VRING_ALIGN, vdev, - true, true, ctx, vm_notify, callback, name); + true, true, ctx, notify, callback, name); if (!vq) { err = -ENOMEM; goto error_new_virtqueue; diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 9e496e288cfa..d6bb68ba84e5 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -288,6 +288,15 @@ static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector) return vp_modern_config_vector(&vp_dev->mdev, vector); } +static bool vp_notify_with_data(struct virtqueue *vq) +{ + u32 data = vring_notification_data(vq); + + iowrite32(data, (void __iomem *)vq->priv); + + return true; +} + static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, struct virtio_pci_vq_info *info, unsigned int index, @@ -298,10 +307,16 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, { struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + bool (*notify)(struct virtqueue *vq); struct virtqueue *vq; u16 num; int err; + if (__virtio_test_bit(&vp_dev->vdev, VIRTIO_F_NOTIFICATION_DATA)) + notify = vp_notify_with_data; + else + notify = vp_notify; + if (index >= vp_modern_get_num_queues(mdev)) return ERR_PTR(-EINVAL); @@ -310,18 +325,13 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, if (!num || vp_modern_get_queue_enable(mdev, index)) return ERR_PTR(-ENOENT); - if (!is_power_of_2(num)) { - dev_warn(&vp_dev->pci_dev->dev, "bad queue size 
%u", num); - return ERR_PTR(-EINVAL); - } - info->msix_vector = msix_vec; /* create the vring */ vq = vring_create_virtqueue(index, num, SMP_CACHE_BYTES, &vp_dev->vdev, true, true, ctx, - vp_notify, callback, name); + notify, callback, name); if (!vq) return ERR_PTR(-ENOMEM); diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 41144b5246a8..c5310eaf8b46 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -231,10 +231,10 @@ static void vring_free(struct virtqueue *_vq); * Helpers. */ -#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) +#define to_vvq(_vq) container_of_const(_vq, struct vring_virtqueue, vq) -static inline bool virtqueue_use_indirect(struct vring_virtqueue *vq, - unsigned int total_sg) +static bool virtqueue_use_indirect(const struct vring_virtqueue *vq, + unsigned int total_sg) { /* * If the host supports indirect descriptor tables, and we have multiple @@ -269,7 +269,7 @@ static inline bool virtqueue_use_indirect(struct vring_virtqueue *vq, * unconditionally on data path. */ -static bool vring_use_dma_api(struct virtio_device *vdev) +static bool vring_use_dma_api(const struct virtio_device *vdev) { if (!virtio_has_dma_quirk(vdev)) return true; @@ -289,7 +289,7 @@ static bool vring_use_dma_api(struct virtio_device *vdev) return false; } -size_t virtio_max_dma_size(struct virtio_device *vdev) +size_t virtio_max_dma_size(const struct virtio_device *vdev) { size_t max_segment_size = SIZE_MAX; @@ -349,7 +349,7 @@ static void vring_free_queue(struct virtio_device *vdev, size_t size, * making all of the arch DMA ops work on the vring device itself * is a mess. */ -static inline struct device *vring_dma_dev(const struct vring_virtqueue *vq) +static struct device *vring_dma_dev(const struct vring_virtqueue *vq) { return vq->dma_dev; } @@ -423,7 +423,7 @@ static void virtqueue_init(struct vring_virtqueue *vq, u32 num) */ static void vring_unmap_one_split_indirect(const struct vring_virtqueue *vq, - struct vring_desc *desc) + const struct vring_desc *desc) { u16 flags; @@ -784,7 +784,7 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, } } -static inline bool more_used_split(const struct vring_virtqueue *vq) +static bool more_used_split(const struct vring_virtqueue *vq) { return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->split.vring.used->idx); @@ -854,6 +854,14 @@ static void virtqueue_disable_cb_split(struct virtqueue *_vq) if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) { vq->split.avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT; + + /* + * If device triggered an event already it won't trigger one again: + * no need to disable. + */ + if (vq->event_triggered) + return; + if (vq->event) /* TODO: this is a hack. Figure out a cleaner value to write. */ vring_used_event(&vq->split.vring) = 0x0; @@ -1172,18 +1180,18 @@ err: /* * Packed ring specific functions - *_packed(). 
*/ -static inline bool packed_used_wrap_counter(u16 last_used_idx) +static bool packed_used_wrap_counter(u16 last_used_idx) { return !!(last_used_idx & (1 << VRING_PACKED_EVENT_F_WRAP_CTR)); } -static inline u16 packed_last_used(u16 last_used_idx) +static u16 packed_last_used(u16 last_used_idx) { return last_used_idx & ~(-(1 << VRING_PACKED_EVENT_F_WRAP_CTR)); } static void vring_unmap_extra_packed(const struct vring_virtqueue *vq, - struct vring_desc_extra *extra) + const struct vring_desc_extra *extra) { u16 flags; @@ -1206,7 +1214,7 @@ static void vring_unmap_extra_packed(const struct vring_virtqueue *vq, } static void vring_unmap_desc_packed(const struct vring_virtqueue *vq, - struct vring_packed_desc *desc) + const struct vring_packed_desc *desc) { u16 flags; @@ -1612,7 +1620,7 @@ static inline bool is_used_desc_packed(const struct vring_virtqueue *vq, return avail == used && used == used_wrap_counter; } -static inline bool more_used_packed(const struct vring_virtqueue *vq) +static bool more_used_packed(const struct vring_virtqueue *vq) { u16 last_used; u16 last_used_idx; @@ -1699,6 +1707,14 @@ static void virtqueue_disable_cb_packed(struct virtqueue *_vq) if (vq->packed.event_flags_shadow != VRING_PACKED_EVENT_FLAG_DISABLE) { vq->packed.event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE; + + /* + * If device triggered an event already it won't trigger one again: + * no need to disable. + */ + if (vq->event_triggered) + return; + vq->packed.vring.driver->flags = cpu_to_le16(vq->packed.event_flags_shadow); } @@ -2330,12 +2346,6 @@ void virtqueue_disable_cb(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); - /* If device triggered an event already it won't trigger one again: - * no need to disable. - */ - if (vq->event_triggered) - return; - if (vq->packed_ring) virtqueue_disable_cb_packed(_vq); else @@ -2752,6 +2762,23 @@ void vring_del_virtqueue(struct virtqueue *_vq) } EXPORT_SYMBOL_GPL(vring_del_virtqueue); +u32 vring_notification_data(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + u16 next; + + if (vq->packed_ring) + next = (vq->packed.next_avail_idx & + ~(-(1 << VRING_PACKED_EVENT_F_WRAP_CTR))) | + vq->packed.avail_wrap_counter << + VRING_PACKED_EVENT_F_WRAP_CTR; + else + next = vq->split.avail_idx_shadow; + + return next << 16 | _vq->index; +} +EXPORT_SYMBOL_GPL(vring_notification_data); + /* Manipulates transport-specific feature bits. */ void vring_transport_features(struct virtio_device *vdev) { @@ -2771,6 +2798,8 @@ void vring_transport_features(struct virtio_device *vdev) break; case VIRTIO_F_ORDER_PLATFORM: break; + case VIRTIO_F_NOTIFICATION_DATA: + break; default: /* We don't understand this bit. */ __virtio_clear_bit(vdev, i); @@ -2786,10 +2815,10 @@ EXPORT_SYMBOL_GPL(vring_transport_features); * Returns the size of the vring. This is mainly used for boasting to * userspace. Unlike other operations, this need not be serialized. */ -unsigned int virtqueue_get_vring_size(struct virtqueue *_vq) +unsigned int virtqueue_get_vring_size(const struct virtqueue *_vq) { - struct vring_virtqueue *vq = to_vvq(_vq); + const struct vring_virtqueue *vq = to_vvq(_vq); return vq->packed_ring ? 
vq->packed.vring.num : vq->split.vring.num; } @@ -2819,9 +2848,9 @@ void __virtqueue_unbreak(struct virtqueue *_vq) } EXPORT_SYMBOL_GPL(__virtqueue_unbreak); -bool virtqueue_is_broken(struct virtqueue *_vq) +bool virtqueue_is_broken(const struct virtqueue *_vq) { - struct vring_virtqueue *vq = to_vvq(_vq); + const struct vring_virtqueue *vq = to_vvq(_vq); return READ_ONCE(vq->broken); } @@ -2868,9 +2897,9 @@ void __virtio_unbreak_device(struct virtio_device *dev) } EXPORT_SYMBOL_GPL(__virtio_unbreak_device); -dma_addr_t virtqueue_get_desc_addr(struct virtqueue *_vq) +dma_addr_t virtqueue_get_desc_addr(const struct virtqueue *_vq) { - struct vring_virtqueue *vq = to_vvq(_vq); + const struct vring_virtqueue *vq = to_vvq(_vq); BUG_ON(!vq->we_own_ring); @@ -2881,9 +2910,9 @@ dma_addr_t virtqueue_get_desc_addr(struct virtqueue *_vq) } EXPORT_SYMBOL_GPL(virtqueue_get_desc_addr); -dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq) +dma_addr_t virtqueue_get_avail_addr(const struct virtqueue *_vq) { - struct vring_virtqueue *vq = to_vvq(_vq); + const struct vring_virtqueue *vq = to_vvq(_vq); BUG_ON(!vq->we_own_ring); @@ -2895,9 +2924,9 @@ dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq) } EXPORT_SYMBOL_GPL(virtqueue_get_avail_addr); -dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq) +dma_addr_t virtqueue_get_used_addr(const struct virtqueue *_vq) { - struct vring_virtqueue *vq = to_vvq(_vq); + const struct vring_virtqueue *vq = to_vvq(_vq); BUG_ON(!vq->we_own_ring); @@ -2910,7 +2939,7 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq) EXPORT_SYMBOL_GPL(virtqueue_get_used_addr); /* Only available for split ring */ -const struct vring *virtqueue_get_vring(struct virtqueue *vq) +const struct vring *virtqueue_get_vring(const struct virtqueue *vq) { return &to_vvq(vq)->split.vring; } diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c index d7f5af62ddaa..eb6aee8c06b2 100644 --- a/drivers/virtio/virtio_vdpa.c +++ b/drivers/virtio/virtio_vdpa.c @@ -13,6 +13,7 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/uuid.h> +#include <linux/group_cpus.h> #include <linux/virtio.h> #include <linux/vdpa.h> #include <linux/virtio_config.h> @@ -112,6 +113,17 @@ static bool virtio_vdpa_notify(struct virtqueue *vq) return true; } +static bool virtio_vdpa_notify_with_data(struct virtqueue *vq) +{ + struct vdpa_device *vdpa = vd_get_vdpa(vq->vdev); + const struct vdpa_config_ops *ops = vdpa->config; + u32 data = vring_notification_data(vq); + + ops->kick_vq_with_data(vdpa, data); + + return true; +} + static irqreturn_t virtio_vdpa_config_cb(void *private) { struct virtio_vdpa_device *vd_dev = private; @@ -138,6 +150,7 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, struct device *dma_dev; const struct vdpa_config_ops *ops = vdpa->config; struct virtio_vdpa_vq_info *info; + bool (*notify)(struct virtqueue *vq) = virtio_vdpa_notify; struct vdpa_callback cb; struct virtqueue *vq; u64 desc_addr, driver_addr, device_addr; @@ -154,6 +167,14 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, if (index >= vdpa->nvqs) return ERR_PTR(-ENOENT); + /* We cannot accept VIRTIO_F_NOTIFICATION_DATA without kick_vq_with_data */ + if (__virtio_test_bit(vdev, VIRTIO_F_NOTIFICATION_DATA)) { + if (ops->kick_vq_with_data) + notify = virtio_vdpa_notify_with_data; + else + __virtio_clear_bit(vdev, VIRTIO_F_NOTIFICATION_DATA); + } + /* Queue shouldn't already be set up. 
*/ if (ops->get_vq_ready(vdpa, index)) return ERR_PTR(-ENOENT); @@ -183,8 +204,7 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, dma_dev = vdpa_get_dma_dev(vdpa); vq = vring_create_virtqueue_dma(index, max_num, align, vdev, true, may_reduce_num, ctx, - virtio_vdpa_notify, callback, - name, dma_dev); + notify, callback, name, dma_dev); if (!vq) { err = -ENOMEM; goto error_new_virtqueue; @@ -195,6 +215,7 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, /* Setup virtqueue callback */ cb.callback = callback ? virtio_vdpa_virtqueue_cb : NULL; cb.private = info; + cb.trigger = NULL; ops->set_vq_cb(vdpa, index, &cb); ops->set_vq_num(vdpa, index, virtqueue_get_vring_size(vq)); @@ -272,6 +293,66 @@ static void virtio_vdpa_del_vqs(struct virtio_device *vdev) virtio_vdpa_del_vq(vq); } +static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs) +{ + affd->nr_sets = 1; + affd->set_size[0] = affvecs; +} + +static struct cpumask * +create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) +{ + unsigned int affvecs = 0, curvec, usedvecs, i; + struct cpumask *masks = NULL; + + if (nvecs > affd->pre_vectors + affd->post_vectors) + affvecs = nvecs - affd->pre_vectors - affd->post_vectors; + + if (!affd->calc_sets) + affd->calc_sets = default_calc_sets; + + affd->calc_sets(affd, affvecs); + + if (!affvecs) + return NULL; + + masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); + if (!masks) + return NULL; + + /* Fill out vectors at the beginning that don't need affinity */ + for (curvec = 0; curvec < affd->pre_vectors; curvec++) + cpumask_setall(&masks[curvec]); + + for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) { + unsigned int this_vecs = affd->set_size[i]; + int j; + struct cpumask *result = group_cpus_evenly(this_vecs); + + if (!result) { + kfree(masks); + return NULL; + } + + for (j = 0; j < this_vecs; j++) + cpumask_copy(&masks[curvec + j], &result[j]); + kfree(result); + + curvec += this_vecs; + usedvecs += this_vecs; + } + + /* Fill out vectors at the end that don't need affinity */ + if (usedvecs >= affvecs) + curvec = affd->pre_vectors + affvecs; + else + curvec = affd->pre_vectors + usedvecs; + for (; curvec < nvecs; curvec++) + cpumask_setall(&masks[curvec]); + + return masks; +} + static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], @@ -282,9 +363,15 @@ static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs, struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev); struct vdpa_device *vdpa = vd_get_vdpa(vdev); const struct vdpa_config_ops *ops = vdpa->config; + struct irq_affinity default_affd = { 0 }; + struct cpumask *masks; struct vdpa_callback cb; int i, err, queue_idx = 0; + masks = create_affinity_masks(nvqs, desc ? 
desc : &default_affd); + if (!masks) + return -ENOMEM; + for (i = 0; i < nvqs; ++i) { if (!names[i]) { vqs[i] = NULL; @@ -298,6 +385,7 @@ static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs, err = PTR_ERR(vqs[i]); goto err_setup_vq; } + ops->set_vq_affinity(vdpa, i, &masks[i]); } cb.callback = virtio_vdpa_config_cb; @@ -337,6 +425,32 @@ static const char *virtio_vdpa_bus_name(struct virtio_device *vdev) return dev_name(&vdpa->dev); } +static int virtio_vdpa_set_vq_affinity(struct virtqueue *vq, + const struct cpumask *cpu_mask) +{ + struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vq->vdev); + struct vdpa_device *vdpa = vd_dev->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + unsigned int index = vq->index; + + if (ops->set_vq_affinity) + return ops->set_vq_affinity(vdpa, index, cpu_mask); + + return 0; +} + +static const struct cpumask * +virtio_vdpa_get_vq_affinity(struct virtio_device *vdev, int index) +{ + struct vdpa_device *vdpa = vd_get_vdpa(vdev); + const struct vdpa_config_ops *ops = vdpa->config; + + if (ops->get_vq_affinity) + return ops->get_vq_affinity(vdpa, index); + + return NULL; +} + static const struct virtio_config_ops virtio_vdpa_config_ops = { .get = virtio_vdpa_get, .set = virtio_vdpa_set, @@ -349,6 +463,8 @@ static const struct virtio_config_ops virtio_vdpa_config_ops = { .get_features = virtio_vdpa_get_features, .finalize_features = virtio_vdpa_finalize_features, .bus_name = virtio_vdpa_bus_name, + .set_vq_affinity = virtio_vdpa_set_vq_affinity, + .get_vq_affinity = virtio_vdpa_get_vq_affinity, }; static void virtio_vdpa_release_dev(struct device *_d) diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 43f59ef10cc9..db1b0eaef4eb 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -10,13 +10,19 @@ #include <linux/if_ether.h> /** - * struct vdpa_calllback - vDPA callback definition. + * struct vdpa_callback - vDPA callback definition. * @callback: interrupt callback function * @private: the data passed to the callback function + * @trigger: the eventfd for the callback (Optional). + * When it is set, the vDPA driver must guarantee that + * signaling it is functional equivalent to triggering + * the callback. Then vDPA parent can signal it directly + * instead of triggering the callback. */ struct vdpa_callback { irqreturn_t (*callback)(void *data); void *private; + struct eventfd_ctx *trigger; }; /** @@ -114,7 +120,7 @@ struct vdpa_dev_set_config { }; /** - * Corresponding file area for device memory mapping + * struct vdpa_map_file - file area for device memory mapping * @file: vma->vm_file for the mapping * @offset: mapping offset in the vm_file */ @@ -143,6 +149,14 @@ struct vdpa_map_file { * @kick_vq: Kick the virtqueue * @vdev: vdpa device * @idx: virtqueue index + * @kick_vq_with_data: Kick the virtqueue and supply extra data + * (only if VIRTIO_F_NOTIFICATION_DATA is negotiated) + * @vdev: vdpa device + * @data for split virtqueue: + * 16 bits vqn and 16 bits next available index. + * @data for packed virtqueue: + * 16 bits vqn, 15 least significant bits of + * next available index and 1 bit next_wrap. * @set_vq_cb: Set the interrupt callback function for * a virtqueue * @vdev: vdpa device @@ -165,10 +179,16 @@ struct vdpa_map_file { * @vdev: vdpa device * @idx: virtqueue index * @state: pointer to returned state (last_avail_idx) + * @get_vendor_vq_stats: Get the vendor statistics of a device. 
+ * @vdev: vdpa device + * @idx: virtqueue index + * @msg: socket buffer holding stats message + * @extack: extack for reporting error messages + * Returns integer: success (0) or error (< 0) * @get_vq_notification: Get the notification area for a virtqueue (optional) * @vdev: vdpa device * @idx: virtqueue index - * Returns the notifcation area + * Returns the notification area * @get_vq_irq: Get the irq number of a virtqueue (optional, * but must implemented if require vq irq offloading) * @vdev: vdpa device @@ -250,6 +270,15 @@ struct vdpa_map_file { * @vdev: vdpa device * Returns the iova range supported by * the device. + * @set_vq_affinity: Set the affinity of virtqueue (optional) + * @vdev: vdpa device + * @idx: virtqueue index + * @cpu_mask: the affinity mask + * Returns integer: success (0) or error (< 0) + * @get_vq_affinity: Get the affinity of virtqueue (optional) + * @vdev: vdpa device + * @idx: virtqueue index + * Returns the affinity mask * @set_group_asid: Set address space identifier for a * virtqueue group (optional) * @vdev: vdpa device @@ -290,6 +319,14 @@ struct vdpa_map_file { * @vdev: vdpa device * @idx: virtqueue index * Returns pointer to structure device or error (NULL) + * @bind_mm: Bind the device to a specific address space + * so the vDPA framework can use VA when this + * callback is implemented. (optional) + * @vdev: vdpa device + * @mm: address space to bind + * @unbind_mm: Unbind the device from the address space + * bound using the bind_mm callback. (optional) + * @vdev: vdpa device * @free: Free resources that belongs to vDPA (optional) * @vdev: vdpa device */ @@ -300,6 +337,7 @@ struct vdpa_config_ops { u64 device_area); void (*set_vq_num)(struct vdpa_device *vdev, u16 idx, u32 num); void (*kick_vq)(struct vdpa_device *vdev, u16 idx); + void (*kick_vq_with_data)(struct vdpa_device *vdev, u32 data); void (*set_vq_cb)(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb); void (*set_vq_ready)(struct vdpa_device *vdev, u16 idx, bool ready); @@ -340,6 +378,10 @@ struct vdpa_config_ops { const void *buf, unsigned int len); u32 (*get_generation)(struct vdpa_device *vdev); struct vdpa_iova_range (*get_iova_range)(struct vdpa_device *vdev); + int (*set_vq_affinity)(struct vdpa_device *vdev, u16 idx, + const struct cpumask *cpu_mask); + const struct cpumask *(*get_vq_affinity)(struct vdpa_device *vdev, + u16 idx); /* DMA ops */ int (*set_map)(struct vdpa_device *vdev, unsigned int asid, @@ -351,6 +393,8 @@ struct vdpa_config_ops { int (*set_group_asid)(struct vdpa_device *vdev, unsigned int group, unsigned int asid); struct device *(*get_vq_dma_dev)(struct vdpa_device *vdev, u16 idx); + int (*bind_mm)(struct vdpa_device *vdev, struct mm_struct *mm); + void (*unbind_mm)(struct vdpa_device *vdev); /* Free device resources */ void (*free)(struct vdpa_device *vdev); @@ -506,6 +550,8 @@ struct vdpa_mgmtdev_ops { * @config_attr_mask: bit mask of attributes of type enum vdpa_attr that * management device support during dev_add callback * @list: list entry + * @supported_features: features supported by device + * @max_supported_vqs: maximum number of virtqueues supported by device */ struct vdpa_mgmt_dev { struct device *device; diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 2b472514c49b..b93238db94e3 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -34,8 +34,8 @@ struct virtqueue { unsigned int index; unsigned int num_free; unsigned int num_max; - void *priv; bool reset; + void *priv; }; int virtqueue_add_outbuf(struct 
virtqueue *vq, @@ -84,14 +84,14 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *vq); void *virtqueue_detach_unused_buf(struct virtqueue *vq); -unsigned int virtqueue_get_vring_size(struct virtqueue *vq); +unsigned int virtqueue_get_vring_size(const struct virtqueue *vq); -bool virtqueue_is_broken(struct virtqueue *vq); +bool virtqueue_is_broken(const struct virtqueue *vq); -const struct vring *virtqueue_get_vring(struct virtqueue *vq); -dma_addr_t virtqueue_get_desc_addr(struct virtqueue *vq); -dma_addr_t virtqueue_get_avail_addr(struct virtqueue *vq); -dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq); +const struct vring *virtqueue_get_vring(const struct virtqueue *vq); +dma_addr_t virtqueue_get_desc_addr(const struct virtqueue *vq); +dma_addr_t virtqueue_get_avail_addr(const struct virtqueue *vq); +dma_addr_t virtqueue_get_used_addr(const struct virtqueue *vq); int virtqueue_resize(struct virtqueue *vq, u32 num, void (*recycle)(struct virtqueue *vq, void *buf)); @@ -147,7 +147,7 @@ int virtio_device_restore(struct virtio_device *dev); #endif void virtio_reset_device(struct virtio_device *dev); -size_t virtio_max_dma_size(struct virtio_device *vdev); +size_t virtio_max_dma_size(const struct virtio_device *vdev); #define virtio_device_for_each_vq(vdev, vq) \ list_for_each_entry(vq, &vdev->vqs, list) diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 8b95b69ef694..9b33df741b63 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -58,6 +58,7 @@ do { \ struct virtio_device; struct virtqueue; +struct device; /* * Creates a virtqueue and allocates the descriptor ring. If @@ -117,4 +118,6 @@ void vring_del_virtqueue(struct virtqueue *vq); void vring_transport_features(struct virtio_device *vdev); irqreturn_t vring_interrupt(int irq, void *_vq); + +u32 vring_notification_data(struct virtqueue *_vq); #endif /* _LINUX_VIRTIO_RING_H */ diff --git a/include/linux/vringh.h b/include/linux/vringh.h index 1991a02c6431..c3a8117dabe8 100644 --- a/include/linux/vringh.h +++ b/include/linux/vringh.h @@ -32,6 +32,9 @@ struct vringh { /* Can we get away with weak barriers? */ bool weak_barriers; + /* Use user's VA */ + bool use_va; + /* Last available index we saw (ie. where we're up to). */ u16 last_avail_idx; @@ -54,6 +57,9 @@ struct vringh { void (*notify)(struct vringh *); }; +struct virtio_device; +typedef void vrh_callback_t(struct virtio_device *, struct vringh *); + /** * struct vringh_config_ops - ops for creating a host vring from a virtio driver * @find_vrhs: find the host vrings and instantiate them @@ -65,8 +71,6 @@ struct vringh { * Returns 0 on success or error status * @del_vrhs: free the host vrings found by find_vrhs(). */ -struct virtio_device; -typedef void vrh_callback_t(struct virtio_device *, struct vringh *); struct vringh_config_ops { int (*find_vrhs)(struct virtio_device *vdev, unsigned nhvrs, struct vringh *vrhs[], vrh_callback_t *callbacks[]); @@ -81,6 +85,12 @@ struct vringh_range { /** * struct vringh_iov - iovec mangler. + * @iov: array of iovecs to operate on + * @consumed: number of bytes consumed within iov[i] + * @i: index of current iovec + * @used: number of iovecs present in @iov + * @max_num: maximum number of iovecs. + * corresponds to allocated memory of @iov * * Mangles iovec in place, and restores it. * Remaining data is iov + i, of used - i elements. @@ -93,6 +103,12 @@ struct vringh_iov { /** * struct vringh_kiov - kvec mangler. 
+ * @iov: array of iovecs to operate on + * @consumed: number of bytes consumed within iov[i] + * @i: index of current iovec + * @used: number of iovecs present in @iov + * @max_num: maximum number of iovecs. + * corresponds to allocated memory of @iov * * Mangles kvec in place, and restores it. * Remaining data is iov + i, of used - i elements. @@ -284,6 +300,12 @@ int vringh_init_iotlb(struct vringh *vrh, u64 features, struct vring_avail *avail, struct vring_used *used); +int vringh_init_iotlb_va(struct vringh *vrh, u64 features, + unsigned int num, bool weak_barriers, + struct vring_desc *desc, + struct vring_avail *avail, + struct vring_used *used); + int vringh_getdesc_iotlb(struct vringh *vrh, struct vringh_kiov *riov, struct vringh_kiov *wiov, diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h index 3c05162bc988..2c712c654165 100644 --- a/include/uapi/linux/virtio_config.h +++ b/include/uapi/linux/virtio_config.h @@ -100,6 +100,12 @@ #define VIRTIO_F_SR_IOV 37 /* + * This feature indicates that the driver passes extra data (besides + * identifying the virtqueue) in its device notifications. + */ +#define VIRTIO_F_NOTIFICATION_DATA 38 + +/* * This feature indicates that the driver can reset a queue individually. */ #define VIRTIO_F_RING_RESET 40 diff --git a/lib/group_cpus.c b/lib/group_cpus.c index 9c837a35fef7..aa3f6815bb12 100644 --- a/lib/group_cpus.c +++ b/lib/group_cpus.c @@ -426,3 +426,4 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps) return masks; } #endif /* CONFIG_SMP */ +EXPORT_SYMBOL_GPL(group_cpus_evenly); diff --git a/tools/include/linux/types.h b/tools/include/linux/types.h index 051fdeaf2670..8519386acd23 100644 --- a/tools/include/linux/types.h +++ b/tools/include/linux/types.h @@ -49,7 +49,12 @@ typedef __s8 s8; #endif #define __force +/* This is defined in linux/compiler_types.h and is left for backward + * compatibility. 
+ */ +#ifndef __user #define __user +#endif #define __must_check #define __cold diff --git a/tools/virtio/linux/compiler.h b/tools/virtio/linux/compiler.h index 2c51bccb97bb..1f3a15b954b9 100644 --- a/tools/virtio/linux/compiler.h +++ b/tools/virtio/linux/compiler.h @@ -2,6 +2,8 @@ #ifndef LINUX_COMPILER_H #define LINUX_COMPILER_H +#include "../../../include/linux/compiler_types.h" + #define WRITE_ONCE(var, val) \ (*((volatile typeof(val) *)(&(var))) = (val)) diff --git a/tools/virtio/linux/kernel.h b/tools/virtio/linux/kernel.h index 8b877167933d..6702008f7f5c 100644 --- a/tools/virtio/linux/kernel.h +++ b/tools/virtio/linux/kernel.h @@ -10,6 +10,7 @@ #include <stdarg.h> #include <linux/compiler.h> +#include "../../../include/linux/container_of.h" #include <linux/log2.h> #include <linux/types.h> #include <linux/overflow.h> @@ -107,10 +108,6 @@ static inline void free_page(unsigned long addr) free((void *)addr); } -#define container_of(ptr, type, member) ({ \ - const typeof( ((type *)0)->member ) *__mptr = (ptr); \ - (type *)( (char *)__mptr - offsetof(type,member) );}) - # ifndef likely # define likely(x) (__builtin_expect(!!(x), 1)) # endif diff --git a/tools/virtio/linux/uaccess.h b/tools/virtio/linux/uaccess.h index 991dfb263998..f13828e0c409 100644 --- a/tools/virtio/linux/uaccess.h +++ b/tools/virtio/linux/uaccess.h @@ -6,15 +6,10 @@ extern void *__user_addr_min, *__user_addr_max; -static inline void __chk_user_ptr(const volatile void *p, size_t size) -{ - assert(p >= __user_addr_min && p + size <= __user_addr_max); -} - #define put_user(x, ptr) \ ({ \ typeof(ptr) __pu_ptr = (ptr); \ - __chk_user_ptr(__pu_ptr, sizeof(*__pu_ptr)); \ + __chk_user_ptr(__pu_ptr); \ WRITE_ONCE(*(__pu_ptr), x); \ 0; \ }) @@ -22,7 +17,7 @@ static inline void __chk_user_ptr(const volatile void *p, size_t size) #define get_user(x, ptr) \ ({ \ typeof(ptr) __pu_ptr = (ptr); \ - __chk_user_ptr(__pu_ptr, sizeof(*__pu_ptr)); \ + __chk_user_ptr(__pu_ptr); \ x = READ_ONCE(*(__pu_ptr)); \ 0; \ }) @@ -37,7 +32,6 @@ static void volatile_memcpy(volatile char *to, const volatile char *from, static inline int copy_from_user(void *to, const void __user volatile *from, unsigned long n) { - __chk_user_ptr(from, n); volatile_memcpy(to, from, n); return 0; } @@ -45,7 +39,6 @@ static inline int copy_from_user(void *to, const void __user volatile *from, static inline int copy_to_user(void __user volatile *to, const void *from, unsigned long n) { - __chk_user_ptr(to, n); volatile_memcpy(to, from, n); return 0; } diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c index 120062f94590..028f54e6854a 100644 --- a/tools/virtio/virtio_test.c +++ b/tools/virtio/virtio_test.c @@ -134,7 +134,7 @@ static void vdev_info_init(struct vdev_info* dev, unsigned long long features) dev->buf_size = 1024; dev->buf = malloc(dev->buf_size); assert(dev->buf); - dev->control = open("/dev/vhost-test", O_RDWR); + dev->control = open("/dev/vhost-test", O_RDWR); assert(dev->control >= 0); r = ioctl(dev->control, VHOST_SET_OWNER, NULL); assert(r >= 0); @@ -327,7 +327,7 @@ const struct option longopts[] = { } }; -static void help(void) +static void help(int status) { fprintf(stderr, "Usage: virtio_test [--help]" " [--no-indirect]" @@ -337,6 +337,8 @@ static void help(void) " [--batch=random/N]" " [--reset=N]" "\n"); + + exit(status); } int main(int argc, char **argv) @@ -354,14 +356,12 @@ int main(int argc, char **argv) case -1: goto done; case '?': - help(); - exit(2); + help(2); case 'e': features &= ~(1ULL << VIRTIO_RING_F_EVENT_IDX); 
break; case 'h': - help(); - goto done; + help(0); case 'i': features &= ~(1ULL << VIRTIO_RING_F_INDIRECT_DESC); break;
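
For context, a minimal sketch (not part of the patch) of how a vDPA parent driver could back the new optional set_vq_affinity()/get_vq_affinity() ops with a stored per-virtqueue cpumask. struct my_parent and the my_* names are hypothetical; only the op signatures come from the vdpa.h hunk above.

#include <linux/container_of.h>
#include <linux/cpumask.h>
#include <linux/vdpa.h>

struct my_vq_state {
        struct cpumask affinity;
        bool affinity_set;
};

struct my_parent {
        struct vdpa_device vdpa;
        struct my_vq_state *vqs;        /* one entry per virtqueue */
};

static int my_set_vq_affinity(struct vdpa_device *vdev, u16 idx,
                              const struct cpumask *cpu_mask)
{
        struct my_parent *p = container_of(vdev, struct my_parent, vdpa);

        cpumask_copy(&p->vqs[idx].affinity, cpu_mask);
        p->vqs[idx].affinity_set = true;
        /* a real parent would also steer its completion interrupt here */
        return 0;
}

static const struct cpumask *my_get_vq_affinity(struct vdpa_device *vdev,
                                                u16 idx)
{
        struct my_parent *p = container_of(vdev, struct my_parent, vdpa);

        return p->vqs[idx].affinity_set ? &p->vqs[idx].affinity : NULL;
}

With ops like these in place, the virtio_vdpa find_vqs() path shown above can spread queues over the masks it builds at setup time and later report them back through get_vq_affinity().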
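Likewise, a hedged sketch of what the new @trigger eventfd in struct vdpa_callback enables on the parent side: when the eventfd is present, the parent may signal it directly from its interrupt handler instead of bouncing through the callback, since the two are documented as equivalent. The my_* names are hypothetical, and the two-argument eventfd_signal() form of this kernel series is assumed.

#include <linux/eventfd.h>
#include <linux/interrupt.h>
#include <linux/vdpa.h>

struct my_vq_irq_ctx {
        struct vdpa_callback cb;        /* saved from the set_vq_cb() op */
};

static irqreturn_t my_vq_interrupt(int irq, void *opaque)
{
        struct my_vq_irq_ctx *ctx = opaque;

        /* Fast path: signal the eventfd the driver registered, if any. */
        if (ctx->cb.trigger) {
                eventfd_signal(ctx->cb.trigger, 1);
                return IRQ_HANDLED;
        }

        if (ctx->cb.callback)
                return ctx->cb.callback(ctx->cb.private);

        return IRQ_NONE;
}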
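The VIRTIO_F_NOTIFICATION_DATA plumbing (vring_notification_data() plus the kick_vq_with_data op) could be consumed by a vDPA-backed transport roughly as sketched below; struct my_transport and the my_* helpers are hypothetical, and the real virtio-vdpa wiring may differ.

#include <linux/container_of.h>
#include <linux/vdpa.h>
#include <linux/virtio.h>
#include <linux/virtio_config.h>
#include <linux/virtio_ring.h>

struct my_transport {
        struct virtio_device vdev;
        struct vdpa_device *vdpa;
};

#define to_my_transport(vd) container_of(vd, struct my_transport, vdev)

static bool my_notify(struct virtqueue *vq)
{
        struct vdpa_device *vdpa = to_my_transport(vq->vdev)->vdpa;

        vdpa->config->kick_vq(vdpa, vq->index);
        return true;
}

static bool my_notify_with_data(struct virtqueue *vq)
{
        struct vdpa_device *vdpa = to_my_transport(vq->vdev)->vdpa;

        /* 32-bit payload built by the ring code: vqn + next avail (+ wrap) */
        vdpa->config->kick_vq_with_data(vdpa, vring_notification_data(vq));
        return true;
}

/* At vq creation time the transport would hand one of the two callbacks
 * above to vring_create_virtqueue(), depending on the negotiated feature
 * (and only offer the feature when the parent implements the op).
 */
static bool (*my_pick_notify(struct virtio_device *vdev))(struct virtqueue *)
{
        if (__virtio_test_bit(vdev, VIRTIO_F_NOTIFICATION_DATA))
                return my_notify_with_data;
        return my_notify;
}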
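On the receiving end, a parent that implements kick_vq_with_data() gets a 32-bit payload in the layout documented in the vdpa.h hunk above. A small illustrative decoder, with hypothetical types and the bit layout taken from that documentation:

#include <linux/types.h>

struct my_split_kick  { u16 vqn; u16 next_avail; };
struct my_packed_kick { u16 vqn; u16 next_avail; bool next_wrap; };

static struct my_split_kick my_decode_split(u32 data)
{
        struct my_split_kick k = {
                .vqn        = data & 0xffff,
                .next_avail = data >> 16,
        };
        return k;
}

static struct my_packed_kick my_decode_packed(u32 data)
{
        struct my_packed_kick k = {
                .vqn        = data & 0xffff,
                .next_avail = (data >> 16) & 0x7fff,  /* 15 LSBs of avail idx */
                .next_wrap  = !!(data >> 31),         /* wrap counter bit */
        };
        return k;
}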
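Finally, a sketch of how the new bind_mm()/unbind_mm() ops and the vringh use_va/vringh_init_iotlb_va() additions could fit together in a software parent such as a simulator. struct my_sim and the my_* functions are hypothetical; only the op and helper signatures come from the hunks above.

#include <linux/container_of.h>
#include <linux/sched/mm.h>
#include <linux/vdpa.h>
#include <linux/virtio_ring.h>
#include <linux/vringh.h>

struct my_sim {
        struct vdpa_device vdpa;
        struct mm_struct *mm;           /* address space bound by the bus */
        struct vringh vrh;
};

static int my_bind_mm(struct vdpa_device *vdev, struct mm_struct *mm)
{
        struct my_sim *s = container_of(vdev, struct my_sim, vdpa);

        mmgrab(mm);                     /* keep the mm_struct itself alive */
        s->mm = mm;
        return 0;
}

static void my_unbind_mm(struct vdpa_device *vdev)
{
        struct my_sim *s = container_of(vdev, struct my_sim, vdpa);

        if (s->mm)
                mmdrop(s->mm);
        s->mm = NULL;
}

static int my_setup_ring(struct my_sim *s, u64 features, unsigned int num,
                         struct vring_desc *desc, struct vring_avail *avail,
                         struct vring_used *used)
{
        /* VA variant: addresses coming through the IOTLB are user VAs */
        return vringh_init_iotlb_va(&s->vrh, features, num, false,
                                    desc, avail, used);
}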