| author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-12 09:50:34 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-12 09:50:34 -0700 |
| commit | 7a53e17accce9d310d2e522dfc701d8da7ccfa65 (patch) | |
| tree | c1ccf061aee42178159cbe9c31c2c4e004b76947 /drivers | |
| parent | 999324f58c41262f5b64d04b7ac54e8f79b019fd (diff) | |
| parent | 93e530d2a1c4c0fcce45e01ae6c5c6287a08d3e3 (diff) | |
Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost
Pull virtio updates from Michael Tsirkin:
- A huge patchset supporting vq resize using the new vq reset
capability
- Features, fixes, and cleanups all over the place
* tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost: (88 commits)
vdpa/mlx5: Fix possible uninitialized return value
vdpa_sim_blk: add support for discard and write-zeroes
vdpa_sim_blk: add support for VIRTIO_BLK_T_FLUSH
vdpa_sim_blk: make vdpasim_blk_check_range usable by other requests
vdpa_sim_blk: check if sector is 0 for commands other than read or write
vdpa_sim: Implement suspend vdpa op
vhost-vdpa: uAPI to suspend the device
vhost-vdpa: introduce SUSPEND backend feature bit
vdpa: Add suspend operation
virtio-blk: Avoid use-after-free on suspend/resume
virtio_vdpa: support the arg sizes of find_vqs()
vhost-vdpa: Call ida_simple_remove() when failed
vDPA: fix 'cast to restricted le16' warnings in vdpa.c
vDPA: !FEATURES_OK should not block querying device config space
vDPA/ifcvf: support userspace to query features and MQ of a management device
vDPA/ifcvf: get_config_size should return a value no greater than dev implementation
vhost scsi: Allow user to control num virtqueues
vhost-scsi: Fix max number of virtqueues
vdpa/mlx5: Support different address spaces for control and data
vdpa/mlx5: Implement suspend virtqueue callback
...
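
Two of the series above add new driver- and user-facing interfaces; short sketches of how they are consumed follow (illustrations, not code from the merge). The suspend series ("vhost-vdpa: uAPI to suspend the device") exposes a backend feature bit and an ioctl on the vhost-vdpa character device. A minimal userspace sketch, assuming the names introduced by this series (`VHOST_BACKEND_F_SUSPEND`, `VHOST_VDPA_SUSPEND`) are present in the installed `<linux/vhost.h>`:

```c
#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/vhost.h>

/* Suspend a vhost-vdpa device; returns 0 on success, -1 on error. */
static int suspend_vdpa(const char *path)
{
	uint64_t features;
	int fd = open(path, O_RDWR);

	if (fd < 0)
		return -1;

	/* The device must advertise the SUSPEND backend feature. */
	if (ioctl(fd, VHOST_GET_BACKEND_FEATURES, &features) < 0 ||
	    !(features & (1ULL << VHOST_BACKEND_F_SUSPEND))) {
		close(fd);
		return -1;
	}

	/* Stop the device without resetting it; vring and config
	 * state is preserved for a later resume or migration. */
	if (ioctl(fd, VHOST_VDPA_SUSPEND) < 0) {
		close(fd);
		return -1;
	}

	close(fd);
	return 0;
}
```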
Diffstat (limited to 'drivers')
33 files changed, 1971 insertions, 510 deletions
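
The virtio_blk and virtio_net diffs below build on the new `virtqueue_resize()` core API, whose contract (visible in the calls below) is: the driver quiesces the queue, then the core detaches every buffer still queued and hands each one to a driver-supplied recycle callback before reallocating the ring. A condensed sketch of that contract for a hypothetical driver "foo" (not the virtio_net code itself; locking and error paths trimmed):

```c
#include <linux/slab.h>
#include <linux/virtio.h>

/* Hypothetical driver state; only what the sketch needs. */
struct foo_dev {
	struct virtio_device *vdev;
	struct virtqueue *rx_vq;
};

/* Recycle callback: invoked by virtqueue_resize() for each buffer
 * that was still sitting in the old ring. */
static void foo_recycle(struct virtqueue *vq, void *buf)
{
	kfree(buf);
}

static int foo_resize_rx(struct foo_dev *foo, u32 ring_num)
{
	int err;

	/* The caller must have quiesced the queue (NAPI off, queue
	 * stopped), as virtnet_rx_resize()/virtnet_tx_resize() below do. */
	err = virtqueue_resize(foo->rx_vq, ring_num, foo_recycle);
	if (err)
		dev_err(&foo->vdev->dev, "rx resize failed: %d\n", err);

	/* Refill the ring and re-enable the queue afterwards. */
	return err;
}
```

In virtio_net this path is driven by the new `virtnet_set_ringparam()`, so once these patches land, userspace can trigger a live resize with `ethtool -G <iface> rx <n> tx <n>`.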
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index d7d72e8f6e55..30255fcaf181 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -101,6 +101,14 @@ static inline blk_status_t virtblk_result(struct virtblk_req *vbr)
 	}
 }
 
+static inline struct virtio_blk_vq *get_virtio_blk_vq(struct blk_mq_hw_ctx *hctx)
+{
+	struct virtio_blk *vblk = hctx->queue->queuedata;
+	struct virtio_blk_vq *vq = &vblk->vqs[hctx->queue_num];
+
+	return vq;
+}
+
 static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr)
 {
 	struct scatterlist hdr, status, *sgs[3];
@@ -416,7 +424,7 @@ static void virtio_queue_rqs(struct request **rqlist)
 	struct request *requeue_list = NULL;
 
 	rq_list_for_each_safe(rqlist, req, next) {
-		struct virtio_blk_vq *vq = req->mq_hctx->driver_data;
+		struct virtio_blk_vq *vq = get_virtio_blk_vq(req->mq_hctx);
 		bool kick;
 
 		if (!virtblk_prep_rq_batch(req)) {
@@ -837,7 +845,7 @@ static void virtblk_complete_batch(struct io_comp_batch *iob)
 static int virtblk_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 {
 	struct virtio_blk *vblk = hctx->queue->queuedata;
-	struct virtio_blk_vq *vq = hctx->driver_data;
+	struct virtio_blk_vq *vq = get_virtio_blk_vq(hctx);
 	struct virtblk_req *vbr;
 	unsigned long flags;
 	unsigned int len;
@@ -862,22 +870,10 @@ static int virtblk_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 	return found;
 }
 
-static int virtblk_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
-			     unsigned int hctx_idx)
-{
-	struct virtio_blk *vblk = data;
-	struct virtio_blk_vq *vq = &vblk->vqs[hctx_idx];
-
-	WARN_ON(vblk->tag_set.tags[hctx_idx] != hctx->tags);
-	hctx->driver_data = vq;
-	return 0;
-}
-
 static const struct blk_mq_ops virtio_mq_ops = {
 	.queue_rq	= virtio_queue_rq,
 	.queue_rqs	= virtio_queue_rqs,
 	.commit_rqs	= virtio_commit_rqs,
-	.init_hctx	= virtblk_init_hctx,
 	.complete	= virtblk_request_done,
 	.map_queues	= virtblk_map_queues,
 	.poll		= virtblk_poll,
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 3b3eebad3977..d934774e9733 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -135,6 +135,9 @@ struct send_queue {
 	struct virtnet_sq_stats stats;
 
 	struct napi_struct napi;
+
+	/* Record whether sq is in reset state. */
+	bool reset;
 };
 
 /* Internal representation of a receive virtqueue */
@@ -267,6 +270,12 @@ struct virtnet_info {
 	u8 duplex;
 	u32 speed;
 
+	/* Interrupt coalescing settings */
+	u32 tx_usecs;
+	u32 rx_usecs;
+	u32 tx_max_packets;
+	u32 rx_max_packets;
+
 	unsigned long guest_offloads;
 	unsigned long guest_offloads_capable;
@@ -284,6 +293,9 @@ struct padded_vnet_hdr {
 	char padding[12];
 };
 
+static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf);
+static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf);
+
 static bool is_xdp_frame(void *ptr)
 {
 	return (unsigned long)ptr & VIRTIO_XDP_FLAG;
 }
@@ -1628,6 +1640,11 @@ static void virtnet_poll_cleantx(struct receive_queue *rq)
 		return;
 
 	if (__netif_tx_trylock(txq)) {
+		if (sq->reset) {
+			__netif_tx_unlock(txq);
+			return;
+		}
+
 		do {
 			virtqueue_disable_cb(sq->vq);
 			free_old_xmit_skbs(sq, true);
@@ -1875,6 +1892,70 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 	return NETDEV_TX_OK;
 }
 
+static int virtnet_rx_resize(struct virtnet_info *vi,
+			     struct receive_queue *rq, u32 ring_num)
+{
+	bool running = netif_running(vi->dev);
+	int err, qindex;
+
+	qindex = rq - vi->rq;
+
+	if (running)
+		napi_disable(&rq->napi);
+
+	err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_free_unused_buf);
+	if (err)
+		netdev_err(vi->dev, "resize rx fail: rx queue index: %d err: %d\n", qindex, err);
+
+	if (!try_fill_recv(vi, rq, GFP_KERNEL))
+		schedule_delayed_work(&vi->refill, 0);
+
+	if (running)
+		virtnet_napi_enable(rq->vq, &rq->napi);
+	return err;
+}
+
+static int virtnet_tx_resize(struct virtnet_info *vi,
+			     struct send_queue *sq, u32 ring_num)
+{
+	bool running = netif_running(vi->dev);
+	struct netdev_queue *txq;
+	int err, qindex;
+
+	qindex = sq - vi->sq;
+
+	if (running)
+		virtnet_napi_tx_disable(&sq->napi);
+
+	txq = netdev_get_tx_queue(vi->dev, qindex);
+
+	/* 1. wait all ximt complete
+	 * 2. fix the race of netif_stop_subqueue() vs netif_start_subqueue()
+	 */
+	__netif_tx_lock_bh(txq);
+
+	/* Prevent rx poll from accessing sq. */
+	sq->reset = true;
+
+	/* Prevent the upper layer from trying to send packets. */
+	netif_stop_subqueue(vi->dev, qindex);
+
+	__netif_tx_unlock_bh(txq);
+
+	err = virtqueue_resize(sq->vq, ring_num, virtnet_sq_free_unused_buf);
+	if (err)
+		netdev_err(vi->dev, "resize tx fail: tx queue index: %d err: %d\n", qindex, err);
+
+	__netif_tx_lock_bh(txq);
+	sq->reset = false;
+	netif_tx_wake_queue(txq);
+	__netif_tx_unlock_bh(txq);
+
+	if (running)
+		virtnet_napi_tx_enable(vi, sq->vq, &sq->napi);
+	return err;
+}
+
 /*
  * Send command via the control virtqueue and check status.  Commands
  * supported by the hypervisor, as indicated by feature bits, should
@@ -2285,10 +2366,57 @@ static void virtnet_get_ringparam(struct net_device *dev,
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 
-	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
-	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
-	ring->rx_pending = ring->rx_max_pending;
-	ring->tx_pending = ring->tx_max_pending;
+	ring->rx_max_pending = vi->rq[0].vq->num_max;
+	ring->tx_max_pending = vi->sq[0].vq->num_max;
+	ring->rx_pending = virtqueue_get_vring_size(vi->rq[0].vq);
+	ring->tx_pending = virtqueue_get_vring_size(vi->sq[0].vq);
+}
+
+static int virtnet_set_ringparam(struct net_device *dev,
+				 struct ethtool_ringparam *ring,
+				 struct kernel_ethtool_ringparam *kernel_ring,
+				 struct netlink_ext_ack *extack)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	u32 rx_pending, tx_pending;
+	struct receive_queue *rq;
+	struct send_queue *sq;
+	int i, err;
+
+	if (ring->rx_mini_pending || ring->rx_jumbo_pending)
+		return -EINVAL;
+
+	rx_pending = virtqueue_get_vring_size(vi->rq[0].vq);
+	tx_pending = virtqueue_get_vring_size(vi->sq[0].vq);
+
+	if (ring->rx_pending == rx_pending &&
+	    ring->tx_pending == tx_pending)
+		return 0;
+
+	if (ring->rx_pending > vi->rq[0].vq->num_max)
+		return -EINVAL;
+
+	if (ring->tx_pending > vi->sq[0].vq->num_max)
+		return -EINVAL;
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		rq = vi->rq + i;
+		sq = vi->sq + i;
+
+		if (ring->tx_pending != tx_pending) {
+			err = virtnet_tx_resize(vi, sq, ring->tx_pending);
+			if (err)
+				return err;
+		}
+
+		if (ring->rx_pending != rx_pending) {
+			err = virtnet_rx_resize(vi, rq, ring->rx_pending);
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
 }
 
 static bool virtnet_commit_rss_command(struct virtnet_info *vi)
@@ -2618,27 +2746,89 @@ static int virtnet_get_link_ksettings(struct net_device *dev,
 	return 0;
 }
 
+static int virtnet_send_notf_coal_cmds(struct virtnet_info *vi,
+				       struct ethtool_coalesce *ec)
+{
+	struct scatterlist sgs_tx, sgs_rx;
+	struct virtio_net_ctrl_coal_tx coal_tx;
+	struct virtio_net_ctrl_coal_rx coal_rx;
+
+	coal_tx.tx_usecs = cpu_to_le32(ec->tx_coalesce_usecs);
+	coal_tx.tx_max_packets = cpu_to_le32(ec->tx_max_coalesced_frames);
+	sg_init_one(&sgs_tx, &coal_tx, sizeof(coal_tx));
+
+	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL,
+				  VIRTIO_NET_CTRL_NOTF_COAL_TX_SET,
+				  &sgs_tx))
+		return -EINVAL;
+
+	/* Save parameters */
+	vi->tx_usecs = ec->tx_coalesce_usecs;
+	vi->tx_max_packets = ec->tx_max_coalesced_frames;
+
+	coal_rx.rx_usecs = cpu_to_le32(ec->rx_coalesce_usecs);
+	coal_rx.rx_max_packets = cpu_to_le32(ec->rx_max_coalesced_frames);
+	sg_init_one(&sgs_rx, &coal_rx, sizeof(coal_rx));
+
+	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_NOTF_COAL,
+				  VIRTIO_NET_CTRL_NOTF_COAL_RX_SET,
+				  &sgs_rx))
+		return -EINVAL;
+
+	/* Save parameters */
+	vi->rx_usecs = ec->rx_coalesce_usecs;
+	vi->rx_max_packets = ec->rx_max_coalesced_frames;
+
+	return 0;
+}
+
+static int virtnet_coal_params_supported(struct ethtool_coalesce *ec)
+{
+	/* usecs coalescing is supported only if VIRTIO_NET_F_NOTF_COAL
+	 * feature is negotiated.
+	 */
+	if (ec->rx_coalesce_usecs || ec->tx_coalesce_usecs)
+		return -EOPNOTSUPP;
+
+	if (ec->tx_max_coalesced_frames > 1 ||
+	    ec->rx_max_coalesced_frames != 1)
+		return -EINVAL;
+
+	return 0;
+}
+
 static int virtnet_set_coalesce(struct net_device *dev,
 				struct ethtool_coalesce *ec,
 				struct kernel_ethtool_coalesce *kernel_coal,
 				struct netlink_ext_ack *extack)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
-	int i, napi_weight;
-
-	if (ec->tx_max_coalesced_frames > 1 ||
-	    ec->rx_max_coalesced_frames != 1)
-		return -EINVAL;
+	int ret, i, napi_weight;
+	bool update_napi = false;
 
+	/* Can't change NAPI weight if the link is up */
 	napi_weight = ec->tx_max_coalesced_frames ? NAPI_POLL_WEIGHT : 0;
 	if (napi_weight ^ vi->sq[0].napi.weight) {
 		if (dev->flags & IFF_UP)
 			return -EBUSY;
+		else
+			update_napi = true;
+	}
+
+	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL))
+		ret = virtnet_send_notf_coal_cmds(vi, ec);
+	else
+		ret = virtnet_coal_params_supported(ec);
+
+	if (ret)
+		return ret;
+
+	if (update_napi) {
 		for (i = 0; i < vi->max_queue_pairs; i++)
 			vi->sq[i].napi.weight = napi_weight;
 	}
 
-	return 0;
+	return ret;
 }
 
 static int virtnet_get_coalesce(struct net_device *dev,
@@ -2646,16 +2836,19 @@ static int virtnet_get_coalesce(struct net_device *dev,
 				struct kernel_ethtool_coalesce *kernel_coal,
 				struct netlink_ext_ack *extack)
 {
-	struct ethtool_coalesce ec_default = {
-		.cmd = ETHTOOL_GCOALESCE,
-		.rx_max_coalesced_frames = 1,
-	};
 	struct virtnet_info *vi = netdev_priv(dev);
 
-	memcpy(ec, &ec_default, sizeof(ec_default));
+	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) {
+		ec->rx_coalesce_usecs = vi->rx_usecs;
+		ec->tx_coalesce_usecs = vi->tx_usecs;
+		ec->tx_max_coalesced_frames = vi->tx_max_packets;
+		ec->rx_max_coalesced_frames = vi->rx_max_packets;
+	} else {
+		ec->rx_max_coalesced_frames = 1;
 
-	if (vi->sq[0].napi.weight)
-		ec->tx_max_coalesced_frames = 1;
+		if (vi->sq[0].napi.weight)
+			ec->tx_max_coalesced_frames = 1;
+	}
 
 	return 0;
 }
@@ -2774,10 +2967,12 @@ static int virtnet_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info)
 }
 
 static const struct ethtool_ops virtnet_ethtool_ops = {
-	.supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES,
+	.supported_coalesce_params = ETHTOOL_COALESCE_MAX_FRAMES |
+		ETHTOOL_COALESCE_USECS,
 	.get_drvinfo = virtnet_get_drvinfo,
 	.get_link = ethtool_op_get_link,
 	.get_ringparam = virtnet_get_ringparam,
+	.set_ringparam = virtnet_set_ringparam,
 	.get_strings = virtnet_get_strings,
 	.get_sset_count = virtnet_get_sset_count,
 	.get_ethtool_stats = virtnet_get_ethtool_stats,
@@ -3171,6 +3366,27 @@ static void free_receive_page_frags(struct virtnet_info *vi)
 			put_page(vi->rq[i].alloc_frag.page);
 }
 
+static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
+{
+	if (!is_xdp_frame(buf))
+		dev_kfree_skb(buf);
+	else
+		xdp_return_frame(ptr_to_xdp(buf));
+}
+
+static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
+{
+	struct virtnet_info *vi = vq->vdev->priv;
+	int i = vq2rxq(vq);
+
+	if (vi->mergeable_rx_bufs)
+		put_page(virt_to_head_page(buf));
+	else if (vi->big_packets)
+		give_pages(&vi->rq[i], buf);
+	else
+		put_page(virt_to_head_page(buf));
+}
+
 static void free_unused_bufs(struct virtnet_info *vi)
 {
 	void *buf;
@@ -3178,26 +3394,14 @@ static void free_unused_bufs(struct virtnet_info *vi)
 	for (i = 0; i < vi->max_queue_pairs; i++) {
 		struct virtqueue *vq = vi->sq[i].vq;
-		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
-			if (!is_xdp_frame(buf))
-				dev_kfree_skb(buf);
-			else
-				xdp_return_frame(ptr_to_xdp(buf));
-		}
+		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
+			virtnet_sq_free_unused_buf(vq, buf);
 	}
 
 	for (i = 0; i < vi->max_queue_pairs; i++) {
 		struct virtqueue *vq = vi->rq[i].vq;
-
-		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
-			if (vi->mergeable_rx_bufs) {
-				put_page(virt_to_head_page(buf));
-			} else if (vi->big_packets) {
-				give_pages(&vi->rq[i], buf);
-			} else {
-				put_page(virt_to_head_page(buf));
-			}
-		}
+		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
+			virtnet_rq_free_unused_buf(vq, buf);
 	}
 }
 
@@ -3228,6 +3432,29 @@ static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq)
 		   (unsigned int)GOOD_PACKET_LEN);
 }
 
+static void virtnet_config_sizes(struct virtnet_info *vi, u32 *sizes)
+{
+	u32 i, rx_size, tx_size;
+
+	if (vi->speed == SPEED_UNKNOWN || vi->speed < SPEED_10000) {
+		rx_size = 1024;
+		tx_size = 1024;
+
+	} else if (vi->speed < SPEED_40000) {
+		rx_size = 1024 * 4;
+		tx_size = 1024 * 4;
+
+	} else {
+		rx_size = 1024 * 8;
+		tx_size = 1024 * 8;
+	}
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		sizes[rxq2vq(i)] = rx_size;
+		sizes[txq2vq(i)] = tx_size;
+	}
+}
+
 static int virtnet_find_vqs(struct virtnet_info *vi)
 {
 	vq_callback_t **callbacks;
@@ -3235,6 +3462,7 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
 	int ret = -ENOMEM;
 	int i, total_vqs;
 	const char **names;
+	u32 *sizes;
 	bool *ctx;
 
 	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
@@ -3262,10 +3490,15 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
 		ctx = NULL;
 	}
 
+	sizes = kmalloc_array(total_vqs, sizeof(*sizes), GFP_KERNEL);
+	if (!sizes)
+		goto err_sizes;
+
 	/* Parameters for control virtqueue, if any */
 	if (vi->has_cvq) {
 		callbacks[total_vqs - 1] = NULL;
 		names[total_vqs - 1] = "control";
+		sizes[total_vqs - 1] = 64;
 	}
@@ -3280,8 +3513,10 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
 		ctx[rxq2vq(i)] = true;
 	}
 
-	ret = virtio_find_vqs_ctx(vi->vdev, total_vqs, vqs, callbacks,
-				  names, ctx, NULL);
+	virtnet_config_sizes(vi, sizes);
+
+	ret = virtio_find_vqs_ctx_size(vi->vdev, total_vqs, vqs, callbacks,
+				       names, sizes, ctx, NULL);
 	if (ret)
 		goto err_find;
@@ -3301,6 +3536,8 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
 
 err_find:
+	kfree(sizes);
+err_sizes:
 	kfree(ctx);
 err_ctx:
 	kfree(names);
@@ -3444,6 +3681,8 @@ static bool virtnet_validate_features(struct virtio_device *vdev)
 			     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_RSS,
 					     "VIRTIO_NET_F_CTRL_VQ") ||
 			     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_HASH_REPORT,
+					     "VIRTIO_NET_F_CTRL_VQ") ||
+			     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_NOTF_COAL,
 					     "VIRTIO_NET_F_CTRL_VQ"))) {
 		return false;
 	}
@@ -3580,6 +3819,13 @@ static int virtnet_probe(struct virtio_device *vdev)
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
 		vi->mergeable_rx_bufs = true;
 
+	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) {
+		vi->rx_usecs = 0;
+		vi->tx_usecs = 0;
+		vi->tx_max_packets = 0;
+		vi->rx_max_packets = 0;
+	}
+
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_HASH_REPORT))
 		vi->has_rss_hash_report = true;
@@ -3651,6 +3897,9 @@ static int virtnet_probe(struct virtio_device *vdev)
 	vi->curr_queue_pairs = num_online_cpus();
 	vi->max_queue_pairs = max_queue_pairs;
 
+	virtnet_init_settings(dev);
+	virtnet_update_settings(vi);
+
 	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
 	err = init_vqs(vi);
 	if (err)
@@ -3663,8 +3912,6 @@ static int virtnet_probe(struct virtio_device *vdev)
 	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
 	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);
 
-	virtnet_init_settings(dev);
-
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
 		vi->failover = net_failover_create(vi->dev);
 		if (IS_ERR(vi->failover)) {
@@ -3814,7 +4061,7 @@ static struct virtio_device_id id_table[] = {
 	VIRTIO_NET_F_CTRL_MAC_ADDR, \
 	VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
 	VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY, \
-	VIRTIO_NET_F_RSS, VIRTIO_NET_F_HASH_REPORT
+	VIRTIO_NET_F_RSS, VIRTIO_NET_F_HASH_REPORT, VIRTIO_NET_F_NOTF_COAL
 
 static unsigned int features[] = {
 	VIRTNET_FEATURES,
diff --git a/drivers/nvdimm/virtio_pmem.c b/drivers/nvdimm/virtio_pmem.c
index 995b6cdc67ed..20da455d2ef6 100644
--- a/drivers/nvdimm/virtio_pmem.c
+++ b/drivers/nvdimm/virtio_pmem.c
@@ -81,17 +81,24 @@ static int virtio_pmem_probe(struct virtio_device *vdev)
 	ndr_desc.res = &res;
 	ndr_desc.numa_node = nid;
 	ndr_desc.flush = async_pmem_flush;
+	ndr_desc.provider_data = vdev;
 	set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
 	set_bit(ND_REGION_ASYNC, &ndr_desc.flags);
+	/*
+	 * The NVDIMM region could be available before the
+	 * virtio_device_ready() that is called by
+	 * virtio_dev_probe(), so we set device ready here.
+	 */
+	virtio_device_ready(vdev);
 	nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc);
 	if (!nd_region) {
 		dev_err(&vdev->dev, "failed to create nvdimm region\n");
 		err = -ENXIO;
 		goto out_nd;
 	}
-	nd_region->provider_data = dev_to_virtio(nd_region->dev.parent->parent);
 	return 0;
 out_nd:
+	virtio_reset_device(vdev);
 	nvdimm_bus_unregister(vpmem->nvdimm_bus);
 out_vq:
 	vdev->config->del_vqs(vdev);
diff --git a/drivers/platform/mellanox/mlxbf-tmfifo.c b/drivers/platform/mellanox/mlxbf-tmfifo.c
index 38800e86ed8a..8be13d416f48 100644
--- a/drivers/platform/mellanox/mlxbf-tmfifo.c
+++ b/drivers/platform/mellanox/mlxbf-tmfifo.c
@@ -928,6 +928,7 @@ static int mlxbf_tmfifo_virtio_find_vqs(struct virtio_device *vdev,
 					struct virtqueue *vqs[],
 					vq_callback_t *callbacks[],
 					const char * const names[],
+					u32 sizes[],
 					const bool *ctx,
 					struct irq_affinity *desc)
 {
@@ -959,6 +960,8 @@ static int mlxbf_tmfifo_virtio_find_vqs(struct virtio_device *vdev,
 			goto error;
 		}
 
+		vq->num_max = vring->num;
+
 		vqs[i] = vq;
 		vring->vq = vq;
 		vq->priv = vring;
diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index 89832399e028..e5279ed9a8d7 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -335,7 +335,7 @@ int rproc_alloc_vring(struct rproc_vdev *rvdev, int i)
 	size_t size;
 
 	/* actual size of vring (in bytes) */
-	size = PAGE_ALIGN(vring_size(rvring->len, rvring->align));
+	size = PAGE_ALIGN(vring_size(rvring->num, rvring->align));
 
 	rsc = (void *)rproc->table_ptr + rvdev->rsc_offset;
 
@@ -402,7 +402,7 @@ rproc_parse_vring(struct rproc_vdev *rvdev, struct fw_rsc_vdev *rsc, int i)
 		return -EINVAL;
 	}
 
-	rvring->len = vring->num;
+	rvring->num = vring->num;
 	rvring->align = vring->align;
 	rvring->rvdev = rvdev;
diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c
index 70ab496d0431..81c4f5776109 100644
--- a/drivers/remoteproc/remoteproc_virtio.c
+++ b/drivers/remoteproc/remoteproc_virtio.c
@@ -87,7 +87,7 @@ static struct virtqueue *rp_find_vq(struct virtio_device *vdev,
 	struct fw_rsc_vdev *rsc;
 	struct virtqueue *vq;
 	void *addr;
-	int len, size;
+	int num, size;
 
 	/* we're temporarily limited to two virtqueues per rvdev */
 	if (id >= ARRAY_SIZE(rvdev->vring))
@@ -104,20 +104,20 @@ static struct virtqueue *rp_find_vq(struct virtio_device *vdev,
 	rvring = &rvdev->vring[id];
 	addr = mem->va;
-	len = rvring->len;
+	num = rvring->num;
 
 	/* zero vring */
-	size = vring_size(len, rvring->align);
+	size = vring_size(num, rvring->align);
 	memset(addr, 0, size);
 
 	dev_dbg(dev, "vring%d: va %pK qsz %d notifyid %d\n",
-		id, addr, len, rvring->notifyid);
+		id, addr, num, rvring->notifyid);
 
 	/*
 	 * Create the new vq, and tell virtio we're not interested in
 	 * the 'weak' smp barriers, since we're talking with a real device.
 	 */
-	vq = vring_new_virtqueue(id, len, rvring->align, vdev, false, ctx,
+	vq = vring_new_virtqueue(id, num, rvring->align, vdev, false, ctx,
 				 addr, rproc_virtio_notify, callback, name);
 	if (!vq) {
 		dev_err(dev, "vring_new_virtqueue %s failed\n", name);
@@ -125,6 +125,8 @@ static struct virtqueue *rp_find_vq(struct virtio_device *vdev,
 		return ERR_PTR(-ENOMEM);
 	}
 
+	vq->num_max = num;
+
 	rvring->vq = vq;
 	vq->priv = rvring;
 
@@ -156,6 +158,7 @@ static int rproc_virtio_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
 				 struct virtqueue *vqs[],
 				 vq_callback_t *callbacks[],
 				 const char * const names[],
+				 u32 sizes[],
 				 const bool * ctx,
 				 struct irq_affinity *desc)
 {
diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
index aa96f67dd0b1..896896e32664 100644
--- a/drivers/s390/virtio/virtio_ccw.c
+++ b/drivers/s390/virtio/virtio_ccw.c
@@ -532,6 +532,9 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev,
 		err = -ENOMEM;
 		goto out_err;
 	}
+
+	vq->num_max = info->num;
+
 	/* it may have been reduced */
 	info->num = virtqueue_get_vring_size(vq);
 
@@ -634,6 +637,7 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 			       struct virtqueue *vqs[],
 			       vq_callback_t *callbacks[],
 			       const char * const names[],
+			       u32 sizes[],
 			       const bool *ctx,
 			       struct irq_affinity *desc)
 {
diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c
index 48c4dadb0c7c..75a703b803a2 100644
--- a/drivers/vdpa/ifcvf/ifcvf_base.c
+++ b/drivers/vdpa/ifcvf/ifcvf_base.c
@@ -29,7 +29,6 @@ u16 ifcvf_set_config_vector(struct ifcvf_hw *hw, int vector)
 {
 	struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg;
 
-	cfg = hw->common_cfg;
 	vp_iowrite16(vector, &cfg->msix_config);
 
 	return vp_ioread16(&cfg->msix_config);
@@ -128,6 +127,7 @@ int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *pdev)
 			break;
 		case VIRTIO_PCI_CAP_DEVICE_CFG:
 			hw->dev_cfg = get_cap_addr(hw, &cap);
+			hw->cap_dev_config_size = le32_to_cpu(cap.length);
 			IFCVF_DBG(pdev, "hw->dev_cfg = %p\n", hw->dev_cfg);
 			break;
 		}
@@ -233,15 +233,23 @@ int ifcvf_verify_min_features(struct ifcvf_hw *hw, u64 features)
 u32 ifcvf_get_config_size(struct ifcvf_hw *hw)
 {
 	struct ifcvf_adapter *adapter;
+	u32 net_config_size = sizeof(struct virtio_net_config);
+	u32 blk_config_size = sizeof(struct virtio_blk_config);
+	u32 cap_size = hw->cap_dev_config_size;
 	u32 config_size;
 
 	adapter = vf_to_adapter(hw);
+	/* If the onboard device config space size is greater than
+	 * the size of struct virtio_net/blk_config, only the spec
+	 * implementing contents size is returned, this is very
+	 * unlikely, defensive programming.
+	 */
 	switch (hw->dev_type) {
 	case VIRTIO_ID_NET:
-		config_size = sizeof(struct virtio_net_config);
+		config_size = min(cap_size, net_config_size);
 		break;
 	case VIRTIO_ID_BLOCK:
-		config_size = sizeof(struct virtio_blk_config);
+		config_size = min(cap_size, blk_config_size);
 		break;
 	default:
 		config_size = 0;
diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h
index 115b61f4924b..f5563f665cc6 100644
--- a/drivers/vdpa/ifcvf/ifcvf_base.h
+++ b/drivers/vdpa/ifcvf/ifcvf_base.h
@@ -87,6 +87,8 @@ struct ifcvf_hw {
 	int config_irq;
 	int vqs_reused_irq;
 	u16 nr_vring;
+	/* VIRTIO_PCI_CAP_DEVICE_CFG size */
+	u32 cap_dev_config_size;
 };
 
 struct ifcvf_adapter {
diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
index 0a5670729412..f9c0044c6442 100644
--- a/drivers/vdpa/ifcvf/ifcvf_main.c
+++ b/drivers/vdpa/ifcvf/ifcvf_main.c
@@ -685,7 +685,7 @@ static struct vdpa_notification_area ifcvf_get_vq_notification(struct vdpa_device *vdpa_dev,
 }
 
 /*
- * IFCVF currently does't have on-chip IOMMU, so not
+ * IFCVF currently doesn't have on-chip IOMMU, so not
 * implemented set_map()/dma_map()/dma_unmap()
 */
 static const struct vdpa_config_ops ifc_vdpa_ops = {
@@ -752,59 +752,36 @@ static int ifcvf_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
 {
 	struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev;
 	struct ifcvf_adapter *adapter;
+	struct vdpa_device *vdpa_dev;
 	struct pci_dev *pdev;
 	struct ifcvf_hw *vf;
-	struct device *dev;
-	int ret, i;
+	int ret;
 
 	ifcvf_mgmt_dev = container_of(mdev, struct ifcvf_vdpa_mgmt_dev, mdev);
-	if (ifcvf_mgmt_dev->adapter)
+	if (!ifcvf_mgmt_dev->adapter)
 		return -EOPNOTSUPP;
 
-	pdev = ifcvf_mgmt_dev->pdev;
-	dev = &pdev->dev;
-	adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa,
-				    dev, &ifc_vdpa_ops, 1, 1, name, false);
-	if (IS_ERR(adapter)) {
-		IFCVF_ERR(pdev, "Failed to allocate vDPA structure");
-		return PTR_ERR(adapter);
-	}
-
-	ifcvf_mgmt_dev->adapter = adapter;
-
+	adapter = ifcvf_mgmt_dev->adapter;
 	vf = &adapter->vf;
-	vf->dev_type = get_dev_type(pdev);
-	vf->base = pcim_iomap_table(pdev);
+	pdev = adapter->pdev;
+	vdpa_dev = &adapter->vdpa;
 
-	adapter->pdev = pdev;
-	adapter->vdpa.dma_dev = &pdev->dev;
-
-	ret = ifcvf_init_hw(vf, pdev);
-	if (ret) {
-		IFCVF_ERR(pdev, "Failed to init IFCVF hw\n");
-		goto err;
-	}
-
-	for (i = 0; i < vf->nr_vring; i++)
-		vf->vring[i].irq = -EINVAL;
-
-	vf->hw_features = ifcvf_get_hw_features(vf);
-	vf->config_size = ifcvf_get_config_size(vf);
+	if (name)
+		ret = dev_set_name(&vdpa_dev->dev, "%s", name);
+	else
+		ret = dev_set_name(&vdpa_dev->dev, "vdpa%u", vdpa_dev->index);
 
-	adapter->vdpa.mdev = &ifcvf_mgmt_dev->mdev;
 	ret = _vdpa_register_device(&adapter->vdpa, vf->nr_vring);
 	if (ret) {
+		put_device(&adapter->vdpa.dev);
 		IFCVF_ERR(pdev, "Failed to register to vDPA bus");
-		goto err;
+		return ret;
 	}
 
 	return 0;
-
-err:
-	put_device(&adapter->vdpa.dev);
-	return ret;
 }
+
 
 static void ifcvf_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)
 {
 	struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev;
@@ -823,61 +800,94 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev;
 	struct device *dev = &pdev->dev;
+	struct ifcvf_adapter *adapter;
+	struct ifcvf_hw *vf;
 	u32 dev_type;
-	int ret;
-
-	ifcvf_mgmt_dev = kzalloc(sizeof(struct ifcvf_vdpa_mgmt_dev), GFP_KERNEL);
-	if (!ifcvf_mgmt_dev) {
-		IFCVF_ERR(pdev, "Failed to alloc memory for the vDPA management device\n");
-		return -ENOMEM;
-	}
-
-	dev_type = get_dev_type(pdev);
-	switch (dev_type) {
-	case VIRTIO_ID_NET:
-		ifcvf_mgmt_dev->mdev.id_table = id_table_net;
-		break;
-	case VIRTIO_ID_BLOCK:
-		ifcvf_mgmt_dev->mdev.id_table = id_table_blk;
-		break;
-	default:
-		IFCVF_ERR(pdev, "VIRTIO ID %u not supported\n", dev_type);
-		ret = -EOPNOTSUPP;
-		goto err;
-	}
-
-	ifcvf_mgmt_dev->mdev.ops = &ifcvf_vdpa_mgmt_dev_ops;
-	ifcvf_mgmt_dev->mdev.device = dev;
-	ifcvf_mgmt_dev->pdev = pdev;
+	int ret, i;
 
 	ret = pcim_enable_device(pdev);
 	if (ret) {
 		IFCVF_ERR(pdev, "Failed to enable device\n");
-		goto err;
+		return ret;
 	}
-
 	ret = pcim_iomap_regions(pdev, BIT(0) | BIT(2) | BIT(4),
 				 IFCVF_DRIVER_NAME);
 	if (ret) {
 		IFCVF_ERR(pdev, "Failed to request MMIO region\n");
-		goto err;
+		return ret;
 	}
 
 	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
 	if (ret) {
 		IFCVF_ERR(pdev, "No usable DMA configuration\n");
-		goto err;
+		return ret;
 	}
 
 	ret = devm_add_action_or_reset(dev, ifcvf_free_irq_vectors, pdev);
 	if (ret) {
 		IFCVF_ERR(pdev,
			  "Failed for adding devres for freeing irq vectors\n");
-		goto err;
+		return ret;
 	}
 
 	pci_set_master(pdev);
 
+	adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa,
+				    dev, &ifc_vdpa_ops, 1, 1, NULL, false);
+	if (IS_ERR(adapter)) {
+		IFCVF_ERR(pdev, "Failed to allocate vDPA structure");
+		return PTR_ERR(adapter);
+	}
+
+	vf = &adapter->vf;
+	vf->dev_type = get_dev_type(pdev);
+	vf->base = pcim_iomap_table(pdev);
+
+	adapter->pdev = pdev;
+	adapter->vdpa.dma_dev = &pdev->dev;
+
+	ret = ifcvf_init_hw(vf, pdev);
+	if (ret) {
+		IFCVF_ERR(pdev, "Failed to init IFCVF hw\n");
+		return ret;
+	}
+
+	for (i = 0; i < vf->nr_vring; i++)
+		vf->vring[i].irq = -EINVAL;
+
+	vf->hw_features = ifcvf_get_hw_features(vf);
+	vf->config_size = ifcvf_get_config_size(vf);
+
+	ifcvf_mgmt_dev = kzalloc(sizeof(struct ifcvf_vdpa_mgmt_dev), GFP_KERNEL);
+	if (!ifcvf_mgmt_dev) {
+		IFCVF_ERR(pdev, "Failed to alloc memory for the vDPA management device\n");
+		return -ENOMEM;
+	}
+
+	ifcvf_mgmt_dev->mdev.ops = &ifcvf_vdpa_mgmt_dev_ops;
+	ifcvf_mgmt_dev->mdev.device = dev;
+	ifcvf_mgmt_dev->adapter = adapter;
+
+	dev_type = get_dev_type(pdev);
+	switch (dev_type) {
+	case VIRTIO_ID_NET:
+		ifcvf_mgmt_dev->mdev.id_table = id_table_net;
+		break;
+	case VIRTIO_ID_BLOCK:
+		ifcvf_mgmt_dev->mdev.id_table = id_table_blk;
+		break;
+	default:
+		IFCVF_ERR(pdev, "VIRTIO ID %u not supported\n", dev_type);
+		ret = -EOPNOTSUPP;
+		goto err;
+	}
+
+	ifcvf_mgmt_dev->mdev.max_supported_vqs = vf->nr_vring;
+	ifcvf_mgmt_dev->mdev.supported_features = vf->hw_features;
+
+	adapter->vdpa.mdev = &ifcvf_mgmt_dev->mdev;
+
+
 	ret = vdpa_mgmtdev_register(&ifcvf_mgmt_dev->mdev);
 	if (ret) {
 		IFCVF_ERR(pdev,
diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 44104093163b..6af9fdbb86b7 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -70,6 +70,16 @@ struct mlx5_vdpa_wq_ent {
 	struct mlx5_vdpa_dev *mvdev;
 };
 
+enum {
+	MLX5_VDPA_DATAVQ_GROUP,
+	MLX5_VDPA_CVQ_GROUP,
+	MLX5_VDPA_NUMVQ_GROUPS
+};
+
+enum {
+	MLX5_VDPA_NUM_AS = MLX5_VDPA_NUMVQ_GROUPS
+};
+
 struct mlx5_vdpa_dev {
 	struct vdpa_device vdev;
 	struct mlx5_core_dev *mdev;
@@ -85,6 +95,7 @@ struct mlx5_vdpa_dev {
 	struct mlx5_vdpa_mr mr;
 	struct mlx5_control_vq cvq;
 	struct workqueue_struct *wq;
+	unsigned int group2asid[MLX5_VDPA_NUMVQ_GROUPS];
 };
 
 int mlx5_vdpa_alloc_pd(struct mlx5_vdpa_dev *dev, u32 *pdn, u16 uid);
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index e85c1d71f4ed..ed100a35e596 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -164,6 +164,7 @@ struct mlx5_vdpa_net {
 	bool setup;
 	u32 cur_num_vqs;
 	u32 rqt_size;
+	bool nb_registered;
 	struct notifier_block nb;
 	struct vdpa_callback config_cb;
 	struct mlx5_vdpa_wq_ent cvq_ent;
@@ -895,6 +896,7 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
 	if (err)
 		goto err_cmd;
 
+	mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT;
 	kfree(in);
 	mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
 
@@ -922,6 +924,7 @@ static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
 		mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id);
 		return;
 	}
+	mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE;
 	umems_destroy(ndev, mvq);
 }
 
@@ -1121,6 +1124,20 @@ err_cmd:
 	return err;
 }
 
+static bool is_valid_state_change(int oldstate, int newstate)
+{
+	switch (oldstate) {
+	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT:
+		return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY;
+	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY:
+		return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND;
+	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND:
+	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_ERR:
+	default:
+		return false;
+	}
+}
+
 static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state)
 {
 	int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
@@ -1130,6 +1147,12 @@ static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state)
 	void *in;
 	int err;
 
+	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_NONE)
+		return 0;
+
+	if (!is_valid_state_change(mvq->fw_state, state))
+		return -EINVAL;
+
 	in = kzalloc(inlen, GFP_KERNEL);
 	if (!in)
 		return -ENOMEM;
@@ -1440,7 +1463,7 @@ static int mlx5_vdpa_add_mac_vlan_rules(struct mlx5_vdpa_net *ndev, u8 *mac,
 	headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers);
 	dmac_c = MLX5_ADDR_OF(fte_match_param, headers_c, outer_headers.dmac_47_16);
 	dmac_v = MLX5_ADDR_OF(fte_match_param, headers_v, outer_headers.dmac_47_16);
-	memset(dmac_c, 0xff, ETH_ALEN);
+	eth_broadcast_addr(dmac_c);
 	ether_addr_copy(dmac_v, mac);
 	MLX5_SET(fte_match_set_lyr_2_4, headers_c, cvlan_tag, 1);
 	if (tagged) {
@@ -1992,6 +2015,7 @@ static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready)
 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
 	struct mlx5_vdpa_virtqueue *mvq;
+	int err;
 
 	if (!mvdev->actual_features)
 		return;
@@ -2005,8 +2029,16 @@ static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready)
 	}
 
 	mvq = &ndev->vqs[idx];
-	if (!ready)
+	if (!ready) {
 		suspend_vq(ndev, mvq);
+	} else {
+		err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
+		if (err) {
+			mlx5_vdpa_warn(mvdev, "modify VQ %d to ready failed (%d)\n", idx, err);
+			ready = false;
+		}
+	}
+
 	mvq->ready = ready;
 }
 
@@ -2095,9 +2127,14 @@ static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev)
 	return PAGE_SIZE;
 }
 
-static u32 mlx5_vdpa_get_vq_group(struct vdpa_device *vdpa, u16 idx)
+static u32 mlx5_vdpa_get_vq_group(struct vdpa_device *vdev, u16 idx)
 {
-	return 0;
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+
+	if (is_ctrl_vq_idx(mvdev, idx))
+		return MLX5_VDPA_CVQ_GROUP;
+
+	return MLX5_VDPA_DATAVQ_GROUP;
 }
 
 enum {
 	MLX5_VIRTIO_NET_F_GUEST_CSUM = 1 << 9,
@@ -2511,6 +2548,15 @@ err_clear:
 	up_write(&ndev->reslock);
 }
 
+static void init_group_to_asid_map(struct mlx5_vdpa_dev *mvdev)
+{
+	int i;
+
+	/* default mapping all groups are mapped to asid 0 */
+	for (i = 0; i < MLX5_VDPA_NUMVQ_GROUPS; i++)
+		mvdev->group2asid[i] = 0;
+}
+
 static int mlx5_vdpa_reset(struct vdpa_device *vdev)
 {
 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
@@ -2529,7 +2575,9 @@ static int mlx5_vdpa_reset(struct vdpa_device *vdev)
 	ndev->mvdev.cvq.completed_desc = 0;
 	memset(ndev->event_cbs, 0, sizeof(*ndev->event_cbs) * (mvdev->max_vqs + 1));
 	ndev->mvdev.actual_features = 0;
+	init_group_to_asid_map(mvdev);
 	++mvdev->generation;
+
 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
 		if (mlx5_vdpa_create_mr(mvdev, NULL))
 			mlx5_vdpa_warn(mvdev, "create MR failed\n");
@@ -2567,26 +2615,63 @@ static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
 	return mvdev->generation;
 }
 
-static int mlx5_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid,
-			     struct vhost_iotlb *iotlb)
+static int set_map_control(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb)
+{
+	u64 start = 0ULL, last = 0ULL - 1;
+	struct vhost_iotlb_map *map;
+	int err = 0;
+
+	spin_lock(&mvdev->cvq.iommu_lock);
+	vhost_iotlb_reset(mvdev->cvq.iotlb);
+
+	for (map = vhost_iotlb_itree_first(iotlb, start, last); map;
+	     map = vhost_iotlb_itree_next(map, start, last)) {
+		err = vhost_iotlb_add_range(mvdev->cvq.iotlb, map->start,
+					    map->last, map->addr, map->perm);
+		if (err)
+			goto out;
+	}
+
+out:
+	spin_unlock(&mvdev->cvq.iommu_lock);
+	return err;
+}
+
+static int set_map_data(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb)
 {
-	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
-	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
 	bool change_map;
 	int err;
 
-	down_write(&ndev->reslock);
-
 	err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map);
 	if (err) {
 		mlx5_vdpa_warn(mvdev, "set map failed(%d)\n", err);
-		goto err;
+		return err;
 	}
 
 	if (change_map)
 		err = mlx5_vdpa_change_map(mvdev, iotlb);
 
-err:
+	return err;
+}
+
+static int mlx5_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid,
+			     struct vhost_iotlb *iotlb)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	int err = -EINVAL;
+
+	down_write(&ndev->reslock);
+	if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] == asid) {
+		err = set_map_data(mvdev, iotlb);
+		if (err)
+			goto out;
+	}
+
+	if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] == asid)
+		err = set_map_control(mvdev, iotlb);
+
+out:
 	up_write(&ndev->reslock);
 	return err;
 }
@@ -2733,6 +2818,49 @@ out_err:
 	return err;
 }
 
+static void mlx5_vdpa_cvq_suspend(struct mlx5_vdpa_dev *mvdev)
+{
+	struct mlx5_control_vq *cvq;
+
+	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
+		return;
+
+	cvq = &mvdev->cvq;
+	cvq->ready = false;
+}
+
+static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
+	struct mlx5_vdpa_virtqueue *mvq;
+	int i;
+
+	down_write(&ndev->reslock);
+	mlx5_notifier_unregister(mvdev->mdev, &ndev->nb);
+	ndev->nb_registered = false;
+	flush_workqueue(ndev->mvdev.wq);
+	for (i = 0; i < ndev->cur_num_vqs; i++) {
+		mvq = &ndev->vqs[i];
+		suspend_vq(ndev, mvq);
+	}
+	mlx5_vdpa_cvq_suspend(mvdev);
+	up_write(&ndev->reslock);
+	return 0;
+}
+
+static int mlx5_set_group_asid(struct vdpa_device *vdev, u32 group,
+			       unsigned int asid)
+{
+	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
+
+	if (group >= MLX5_VDPA_NUMVQ_GROUPS)
+		return -EINVAL;
+
+	mvdev->group2asid[group] = asid;
+	return 0;
+}
+
 static const struct vdpa_config_ops mlx5_vdpa_ops = {
 	.set_vq_address = mlx5_vdpa_set_vq_address,
 	.set_vq_num = mlx5_vdpa_set_vq_num,
@@ -2762,7 +2890,9 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = {
 	.set_config = mlx5_vdpa_set_config,
 	.get_generation = mlx5_vdpa_get_generation,
 	.set_map = mlx5_vdpa_set_map,
+	.set_group_asid = mlx5_set_group_asid,
 	.free = mlx5_vdpa_free,
+	.suspend = mlx5_vdpa_suspend,
 };
 
 static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
@@ -2828,6 +2958,7 @@ static void init_mvqs(struct mlx5_vdpa_net *ndev)
 		mvq->index = i;
 		mvq->ndev = ndev;
 		mvq->fwqp.fw = true;
+		mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE;
 	}
 	for (; i < ndev->mvdev.max_vqs; i++) {
 		mvq = &ndev->vqs[i];
@@ -2902,13 +3033,21 @@ static int event_handler(struct notifier_block *nb, unsigned long event, void *param)
 		switch (eqe->sub_type) {
 		case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
 		case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
+			down_read(&ndev->reslock);
+			if (!ndev->nb_registered) {
+				up_read(&ndev->reslock);
+				return NOTIFY_DONE;
+			}
 			wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC);
-			if (!wqent)
+			if (!wqent) {
+				up_read(&ndev->reslock);
 				return NOTIFY_DONE;
+			}
 
 			wqent->mvdev = &ndev->mvdev;
 			INIT_WORK(&wqent->work, update_carrier);
 			queue_work(ndev->mvdev.wq, &wqent->work);
+			up_read(&ndev->reslock);
 			ret = NOTIFY_OK;
 			break;
 		default:
@@ -2982,7 +3121,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
 	}
 
 	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
-				 1, 1, name, false);
+				 MLX5_VDPA_NUMVQ_GROUPS, MLX5_VDPA_NUM_AS, name, false);
 	if (IS_ERR(ndev))
 		return PTR_ERR(ndev);
 
@@ -3062,6 +3201,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
 
 	ndev->nb.notifier_call = event_handler;
 	mlx5_notifier_register(mdev, &ndev->nb);
+	ndev->nb_registered = true;
 	mvdev->vdev.mdev = &mgtdev->mgtdev;
 	err = _vdpa_register_device(&mvdev->vdev, max_vqs + 1);
 	if (err)
@@ -3093,7 +3233,10 @@ static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev)
 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
 	struct workqueue_struct *wq;
 
-	mlx5_notifier_unregister(mvdev->mdev, &ndev->nb);
+	if (ndev->nb_registered) {
+		mlx5_notifier_unregister(mvdev->mdev, &ndev->nb);
+		ndev->nb_registered = false;
+	}
 	wq = mvdev->wq;
 	mvdev->wq = NULL;
 	destroy_workqueue(wq);
diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index ebf2f363fbe7..c06c02704461 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -824,11 +824,11 @@ static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff *msg,
 			 config.mac))
 		return -EMSGSIZE;
 
-	val_u16 = le16_to_cpu(config.status);
+	val_u16 = __virtio16_to_cpu(true, config.status);
 	if (nla_put_u16(msg, VDPA_ATTR_DEV_NET_STATUS, val_u16))
 		return -EMSGSIZE;
 
-	val_u16 = le16_to_cpu(config.mtu);
+	val_u16 = __virtio16_to_cpu(true, config.mtu);
 	if (nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MTU, val_u16))
 		return -EMSGSIZE;
 
@@ -846,17 +846,9 @@ vdpa_dev_config_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid,
 {
 	u32 device_id;
 	void *hdr;
-	u8 status;
 	int err;
 
 	down_read(&vdev->cf_lock);
-	status = vdev->config->get_status(vdev);
-	if (!(status & VIRTIO_CONFIG_S_FEATURES_OK)) {
-		NL_SET_ERR_MSG_MOD(extack, "Features negotiation not completed");
-		err = -EAGAIN;
-		goto out;
-	}
-
 	hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags,
 			  VDPA_CMD_DEV_CONFIG_GET);
 	if (!hdr) {
@@ -913,7 +905,7 @@ static int vdpa_fill_stats_rec(struct vdpa_device *vdev, struct sk_buff *msg,
 	}
 	vdpa_get_config_unlocked(vdev, 0, &config, sizeof(config));
 
-	max_vqp = le16_to_cpu(config.max_virtqueue_pairs);
+	max_vqp = __virtio16_to_cpu(true, config.max_virtqueue_pairs);
 	if (nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MAX_VQP, max_vqp))
 		return -EMSGSIZE;
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 0f2865899647..225b7f5d8be3 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -33,7 +33,7 @@ MODULE_PARM_DESC(batch_mapping, "Batched mapping 1 -Enable; 0 - Disable");
 static int max_iotlb_entries = 2048;
 module_param(max_iotlb_entries, int, 0444);
 MODULE_PARM_DESC(max_iotlb_entries,
-		 "Maximum number of iotlb entries. 0 means unlimited. (default: 2048)");
+		 "Maximum number of iotlb entries for each address space. 0 means unlimited. (default: 2048)");
 
 #define VDPASIM_QUEUE_ALIGN PAGE_SIZE
 #define VDPASIM_QUEUE_MAX 256
@@ -107,6 +107,7 @@ static void vdpasim_do_reset(struct vdpasim *vdpasim)
 	for (i = 0; i < vdpasim->dev_attr.nas; i++)
 		vhost_iotlb_reset(&vdpasim->iommu[i]);
 
+	vdpasim->running = true;
 	spin_unlock(&vdpasim->iommu_lock);
 
 	vdpasim->features = 0;
@@ -291,7 +292,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr)
 		goto err_iommu;
 
 	for (i = 0; i < vdpasim->dev_attr.nas; i++)
-		vhost_iotlb_init(&vdpasim->iommu[i], 0, 0);
+		vhost_iotlb_init(&vdpasim->iommu[i], max_iotlb_entries, 0);
 
 	vdpasim->buffer = kvmalloc(dev_attr->buffer_size, GFP_KERNEL);
 	if (!vdpasim->buffer)
@@ -505,6 +506,17 @@ static int vdpasim_reset(struct vdpa_device *vdpa)
 	return 0;
 }
 
+static int vdpasim_suspend(struct vdpa_device *vdpa)
+{
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+	spin_lock(&vdpasim->lock);
+	vdpasim->running = false;
+	spin_unlock(&vdpasim->lock);
+
+	return 0;
+}
+
 static size_t vdpasim_get_config_size(struct vdpa_device *vdpa)
 {
 	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
@@ -694,6 +706,7 @@ static const struct vdpa_config_ops vdpasim_config_ops = {
 	.get_status = vdpasim_get_status,
 	.set_status = vdpasim_set_status,
 	.reset = vdpasim_reset,
+	.suspend = vdpasim_suspend,
 	.get_config_size = vdpasim_get_config_size,
 	.get_config = vdpasim_get_config,
 	.set_config = vdpasim_set_config,
@@ -726,6 +739,7 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = {
 	.get_status = vdpasim_get_status,
 	.set_status = vdpasim_set_status,
 	.reset = vdpasim_reset,
+	.suspend = vdpasim_suspend,
 	.get_config_size = vdpasim_get_config_size,
 	.get_config = vdpasim_get_config,
 	.set_config = vdpasim_set_config,
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h
index 622782e92239..061986f30911 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.h
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h
@@ -66,6 +66,7 @@ struct vdpasim {
 	u32 generation;
 	u64 features;
 	u32 groups;
+	bool running;
 	/* spinlock to synchronize iommu table */
 	spinlock_t iommu_lock;
 };
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
index 42d401d43911..c8bfea3b7db2 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
@@ -25,31 +25,49 @@
 #define DRV_LICENSE "GPL v2"
 
 #define VDPASIM_BLK_FEATURES	(VDPASIM_FEATURES | \
+				 (1ULL << VIRTIO_BLK_F_FLUSH)    | \
 				 (1ULL << VIRTIO_BLK_F_SIZE_MAX) | \
 				 (1ULL << VIRTIO_BLK_F_SEG_MAX)  | \
 				 (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \
 				 (1ULL << VIRTIO_BLK_F_TOPOLOGY) | \
-				 (1ULL << VIRTIO_BLK_F_MQ))
+				 (1ULL << VIRTIO_BLK_F_MQ)       | \
+				 (1ULL << VIRTIO_BLK_F_DISCARD)  | \
+				 (1ULL << VIRTIO_BLK_F_WRITE_ZEROES))
 
 #define VDPASIM_BLK_CAPACITY	0x40000
 #define VDPASIM_BLK_SIZE_MAX	0x1000
 #define VDPASIM_BLK_SEG_MAX	32
+#define VDPASIM_BLK_DWZ_MAX_SECTORS	UINT_MAX
+
+/* 1 virtqueue, 1 address space, 1 virtqueue group */
 #define VDPASIM_BLK_VQ_NUM	1
+#define VDPASIM_BLK_AS_NUM	1
+#define VDPASIM_BLK_GROUP_NUM	1
 
 static char vdpasim_blk_id[VIRTIO_BLK_ID_BYTES] = "vdpa_blk_sim";
 
-static bool vdpasim_blk_check_range(u64 start_sector, size_t range_size)
+static bool vdpasim_blk_check_range(struct vdpasim *vdpasim, u64 start_sector,
+				    u64 num_sectors, u64 max_sectors)
 {
-	u64 range_sectors = range_size >> SECTOR_SHIFT;
-
-	if (range_size > VDPASIM_BLK_SIZE_MAX * VDPASIM_BLK_SEG_MAX)
-		return false;
+	if (start_sector > VDPASIM_BLK_CAPACITY) {
+		dev_dbg(&vdpasim->vdpa.dev,
+			"starting sector exceeds the capacity - start: 0x%llx capacity: 0x%x\n",
+			start_sector, VDPASIM_BLK_CAPACITY);
+	}
 
-	if (start_sector > VDPASIM_BLK_CAPACITY)
+	if (num_sectors > max_sectors) {
+		dev_dbg(&vdpasim->vdpa.dev,
+			"number of sectors exceeds the max allowed in a request - num: 0x%llx max: 0x%llx\n",
+			num_sectors, max_sectors);
 		return false;
+	}
 
-	if (range_sectors > VDPASIM_BLK_CAPACITY - start_sector)
+	if (num_sectors > VDPASIM_BLK_CAPACITY - start_sector) {
+		dev_dbg(&vdpasim->vdpa.dev,
+			"request exceeds the capacity - start: 0x%llx num: 0x%llx capacity: 0x%x\n",
+			start_sector, num_sectors, VDPASIM_BLK_CAPACITY);
 		return false;
+	}
 
 	return true;
 }
@@ -63,6 +81,7 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim,
 {
 	size_t pushed = 0, to_pull, to_push;
 	struct virtio_blk_outhdr hdr;
+	bool handled = false;
 	ssize_t bytes;
 	loff_t offset;
 	u64 sector;
@@ -76,14 +95,14 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim,
 		return false;
 
 	if (vq->out_iov.used < 1 || vq->in_iov.used < 1) {
-		dev_err(&vdpasim->vdpa.dev, "missing headers - out_iov: %u in_iov %u\n",
+		dev_dbg(&vdpasim->vdpa.dev, "missing headers - out_iov: %u in_iov %u\n",
 			vq->out_iov.used, vq->in_iov.used);
-		return false;
+		goto err;
 	}
 
 	if (vq->in_iov.iov[vq->in_iov.used - 1].iov_len < 1) {
-		dev_err(&vdpasim->vdpa.dev, "request in header too short\n");
-		return false;
+		dev_dbg(&vdpasim->vdpa.dev, "request in header too short\n");
+		goto err;
 	}
 
 	/* The last byte is the status and we checked if the last iov has
@@ -96,8 +115,8 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim,
 	bytes = vringh_iov_pull_iotlb(&vq->vring, &vq->out_iov, &hdr,
 				      sizeof(hdr));
 	if (bytes != sizeof(hdr)) {
-		dev_err(&vdpasim->vdpa.dev, "request out header too short\n");
-		return false;
+		dev_dbg(&vdpasim->vdpa.dev, "request out header too short\n");
+		goto err;
 	}
 
 	to_pull -= bytes;
@@ -107,12 +126,20 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim,
 	offset = sector << SECTOR_SHIFT;
 	status = VIRTIO_BLK_S_OK;
 
+	if (type != VIRTIO_BLK_T_IN && type != VIRTIO_BLK_T_OUT &&
+	    sector != 0) {
+		dev_dbg(&vdpasim->vdpa.dev,
+			"sector must be 0 for %u request - sector: 0x%llx\n",
+			type, sector);
+		status = VIRTIO_BLK_S_IOERR;
+		goto err_status;
+	}
+
 	switch (type) {
 	case VIRTIO_BLK_T_IN:
-		if (!vdpasim_blk_check_range(sector, to_push)) {
-			dev_err(&vdpasim->vdpa.dev,
-				"reading over the capacity - offset: 0x%llx len: 0x%zx\n",
-				offset, to_push);
+		if (!vdpasim_blk_check_range(vdpasim, sector,
+					     to_push >> SECTOR_SHIFT,
+					     VDPASIM_BLK_SIZE_MAX * VDPASIM_BLK_SEG_MAX)) {
 			status = VIRTIO_BLK_S_IOERR;
 			break;
 		}
@@ -121,7 +148,7 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim,
 					      vdpasim->buffer + offset,
 					      to_push);
 		if (bytes < 0) {
-			dev_err(&vdpasim->vdpa.dev,
+			dev_dbg(&vdpasim->vdpa.dev,
 				"vringh_iov_push_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n",
 				bytes, offset, to_push);
 			status = VIRTIO_BLK_S_IOERR;
@@ -132,10 +159,9 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim,
 		break;
 
 	case VIRTIO_BLK_T_OUT:
-		if (!vdpasim_blk_check_range(sector, to_pull)) {
-			dev_err(&vdpasim->vdpa.dev,
-				"writing over the capacity - offset: 0x%llx len: 0x%zx\n",
-				offset, to_pull);
+		if (!vdpasim_blk_check_range(vdpasim, sector,
+					     to_pull >> SECTOR_SHIFT,
+					     VDPASIM_BLK_SIZE_MAX * VDPASIM_BLK_SEG_MAX)) {
 			status = VIRTIO_BLK_S_IOERR;
 			break;
 		}
@@ -144,7 +170,7 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim,
 					      vdpasim->buffer + offset,
 					      to_pull);
 		if (bytes < 0) {
-			dev_err(&vdpasim->vdpa.dev,
+			dev_dbg(&vdpasim->vdpa.dev,
 				"vringh_iov_pull_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n",
 				bytes, offset, to_pull);
 			status = VIRTIO_BLK_S_IOERR;
@@ -157,7 +183,7 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim,
 					      vdpasim_blk_id,
 					      VIRTIO_BLK_ID_BYTES);
 		if (bytes < 0) {
-			dev_err(&vdpasim->vdpa.dev,
+			dev_dbg(&vdpasim->vdpa.dev,
 				"vringh_iov_push_iotlb() error: %zd\n", bytes);
 			status = VIRTIO_BLK_S_IOERR;
 			break;
@@ -166,13 +192,76 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim,
 		pushed += bytes;
 		break;
 
+	case VIRTIO_BLK_T_FLUSH:
+		/* nothing to do */
+		break;
+
+	case VIRTIO_BLK_T_DISCARD:
+	case VIRTIO_BLK_T_WRITE_ZEROES: {
+		struct virtio_blk_discard_write_zeroes range;
+		u32 num_sectors, flags;
+
+		if (to_pull != sizeof(range)) {
+			dev_dbg(&vdpasim->vdpa.dev,
+				"discard/write_zeroes header len: 0x%zx [expected: 0x%zx]\n",
+				to_pull, sizeof(range));
+			status = VIRTIO_BLK_S_IOERR;
+			break;
+		}
+
+		bytes = vringh_iov_pull_iotlb(&vq->vring, &vq->out_iov, &range,
+					      to_pull);
+		if (bytes < 0) {
+			dev_dbg(&vdpasim->vdpa.dev,
+				"vringh_iov_pull_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n",
+				bytes, offset, to_pull);
+			status = VIRTIO_BLK_S_IOERR;
+			break;
+		}
+
+		sector = le64_to_cpu(range.sector);
+		offset = sector << SECTOR_SHIFT;
+		num_sectors = le32_to_cpu(range.num_sectors);
+		flags = le32_to_cpu(range.flags);
+
+		if (type == VIRTIO_BLK_T_DISCARD && flags != 0) {
+			dev_dbg(&vdpasim->vdpa.dev,
+				"discard unexpected flags set - flags: 0x%x\n",
+				flags);
+			status = VIRTIO_BLK_S_UNSUPP;
+			break;
+		}
+
+		if (type == VIRTIO_BLK_T_WRITE_ZEROES &&
+		    flags & ~VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
+			dev_dbg(&vdpasim->vdpa.dev,
+				"write_zeroes unexpected flags set - flags: 0x%x\n",
+				flags);
+			status = VIRTIO_BLK_S_UNSUPP;
+			break;
+		}
+
+		if (!vdpasim_blk_check_range(vdpasim, sector, num_sectors,
+					     VDPASIM_BLK_DWZ_MAX_SECTORS)) {
+			status = VIRTIO_BLK_S_IOERR;
+			break;
+		}
+
+		if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
+			memset(vdpasim->buffer + offset, 0,
+			       num_sectors << SECTOR_SHIFT);
+		}
+
+		break;
+	}
 	default:
-		dev_warn(&vdpasim->vdpa.dev,
-			 "Unsupported request type %d\n", type);
+		dev_dbg(&vdpasim->vdpa.dev,
+			"Unsupported request type %d\n", type);
 		status = VIRTIO_BLK_S_IOERR;
 		break;
 	}
 
+err_status:
 	/* If some operations fail, we need to skip the remaining bytes
 	 * to put the status in the last byte
 	 */
@@ -182,21 +271,25 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim,
 	/* Last byte is the status */
 	bytes = vringh_iov_push_iotlb(&vq->vring, &vq->in_iov, &status, 1);
 	if (bytes != 1)
-		return false;
+		goto err;
 
 	pushed += bytes;
 
 	/* Make sure data is wrote before advancing index */
 	smp_wmb();
 
+	handled = true;
+
+err:
 	vringh_complete_iotlb(&vq->vring, vq->head, pushed);
 
-	return true;
+	return handled;
 }
 
 static void vdpasim_blk_work(struct work_struct *work)
 {
 	struct vdpasim *vdpasim = container_of(work, struct vdpasim, work);
+	bool reschedule = false;
 	int i;
 
 	spin_lock(&vdpasim->lock);
 
@@ -204,8 +297,12 @@ static void vdpasim_blk_work(struct work_struct *work)
 	if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK))
 		goto out;
 
+	if (!vdpasim->running)
+		goto out;
+
 	for (i = 0; i < VDPASIM_BLK_VQ_NUM; i++) {
 		struct vdpasim_virtqueue *vq = &vdpasim->vqs[i];
+		int reqs = 0;
 
 		if (!vq->ready)
 			continue;
@@ -218,10 +315,18 @@ static void vdpasim_blk_work(struct work_struct *work)
 			if (vringh_need_notify_iotlb(&vq->vring) > 0)
 				vringh_notify(&vq->vring);
 			local_bh_enable();
+
+			if (++reqs > 4) {
+				reschedule = true;
+				break;
+			}
 		}
 	}
 out:
 	spin_unlock(&vdpasim->lock);
+
+	if (reschedule)
+		schedule_work(&vdpasim->work);
 }
 
 static void vdpasim_blk_get_config(struct vdpasim *vdpasim, void *config)
@@ -237,6 +342,17 @@ static void vdpasim_blk_get_config(struct vdpasim *vdpasim, void *config)
 	blk_config->min_io_size = cpu_to_vdpasim16(vdpasim, 1);
 	blk_config->opt_io_size = cpu_to_vdpasim32(vdpasim, 1);
 	blk_config->blk_size = cpu_to_vdpasim32(vdpasim, SECTOR_SIZE);
+	/* VIRTIO_BLK_F_DISCARD */
+	blk_config->discard_sector_alignment =
+		cpu_to_vdpasim32(vdpasim, SECTOR_SIZE);
+	blk_config->max_discard_sectors =
+		cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_DWZ_MAX_SECTORS);
+	blk_config->max_discard_seg = cpu_to_vdpasim32(vdpasim, 1);
+	/* VIRTIO_BLK_F_WRITE_ZEROES */
+	blk_config->max_write_zeroes_sectors =
+		cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_DWZ_MAX_SECTORS);
+	blk_config->max_write_zeroes_seg = cpu_to_vdpasim32(vdpasim, 1);
+
 }
 
 static void vdpasim_blk_mgmtdev_release(struct device *dev)
@@ -260,6 +376,8 @@ static int vdpasim_blk_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
 	dev_attr.id = VIRTIO_ID_BLOCK;
 	dev_attr.supported_features = VDPASIM_BLK_FEATURES;
 	dev_attr.nvqs = VDPASIM_BLK_VQ_NUM;
+	dev_attr.ngroups = VDPASIM_BLK_GROUP_NUM;
+	dev_attr.nas = VDPASIM_BLK_AS_NUM;
 	dev_attr.config_size = sizeof(struct virtio_blk_config);
 	dev_attr.get_config = vdpasim_blk_get_config;
 	dev_attr.work_fn = vdpasim_blk_work;
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
index 5125976a4df8..886449e88502 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c
@@ -154,6 +154,9 @@ static void vdpasim_net_work(struct work_struct *work)
 
 	spin_lock(&vdpasim->lock);
 
+	if (!vdpasim->running)
+		goto out;
+
 	if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK))
 		goto out;
diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c
index 6daa3978d290..e682bc7ee6c9 100644
--- a/drivers/vdpa/vdpa_user/iova_domain.c
+++ b/drivers/vdpa/vdpa_user/iova_domain.c
@@ -138,18 +138,17 @@ static void do_bounce(phys_addr_t orig, void *addr, size_t size,
 {
 	unsigned long pfn = PFN_DOWN(orig);
 	unsigned int offset = offset_in_page(orig);
-	char *buffer;
+	struct page *page;
 	unsigned int sz = 0;
 
 	while (size) {
 		sz = min_t(size_t, PAGE_SIZE - offset, size);
 
-		buffer = kmap_atomic(pfn_to_page(pfn));
+		page = pfn_to_page(pfn);
 		if (dir == DMA_TO_DEVICE)
-			memcpy(addr, buffer + offset, sz);
+			memcpy_from_page(addr, page, offset, sz);
 		else
-			memcpy(buffer + offset, addr, sz);
-		kunmap_atomic(buffer);
+			memcpy_to_page(page, offset, addr, sz);
 
 		size -= sz;
 		pfn++;
@@ -179,8 +178,9 @@ static void vduse_domain_bounce(struct vduse_iova_domain *domain,
 			    map->orig_phys == INVALID_PHYS_ADDR))
 			return;
 
-		addr = page_address(map->bounce_page) + offset;
-		do_bounce(map->orig_phys + offset, addr, sz, dir);
+		addr = kmap_local_page(map->bounce_page);
+		do_bounce(map->orig_phys + offset, addr + offset, sz, dir);
+		kunmap_local(addr);
 		size -= sz;
 		iova += sz;
 	}
@@ -213,21 +213,21 @@ vduse_domain_get_bounce_page(struct vduse_iova_domain *domain, u64 iova)
 	struct vduse_bounce_map *map;
 	struct page *page = NULL;
 
-	spin_lock(&domain->iotlb_lock);
+	read_lock(&domain->bounce_lock);
 	map = &domain->bounce_maps[iova >> PAGE_SHIFT];
-	if (!map->bounce_page)
+	if (domain->user_bounce_pages || !map->bounce_page)
 		goto out;
 
 	page = map->bounce_page;
 	get_page(page);
 out:
-	spin_unlock(&domain->iotlb_lock);
+	read_unlock(&domain->bounce_lock);
 
 	return page;
 }
 
 static void
-vduse_domain_free_bounce_pages(struct vduse_iova_domain *domain)
+vduse_domain_free_kernel_bounce_pages(struct vduse_iova_domain *domain)
 {
 	struct vduse_bounce_map *map;
 	unsigned long pfn, bounce_pfns;
@@ -247,6 +247,73 @@ vduse_domain_free_bounce_pages(struct vduse_iova_domain *domain)
 	}
 }
 
+int vduse_domain_add_user_bounce_pages(struct vduse_iova_domain *domain,
+				       struct page **pages, int count)
+{
+	struct vduse_bounce_map *map;
+	int i, ret;
+
+	/* Now we don't support partial mapping */
+	if (count != (domain->bounce_size >> PAGE_SHIFT))
+		return -EINVAL;
+
+	write_lock(&domain->bounce_lock);
+	ret = -EEXIST;
+	if (domain->user_bounce_pages)
+		goto out;
+
+	for (i = 0; i < count; i++) {
+		map = &domain->bounce_maps[i];
+		if (map->bounce_page) {
+			/* Copy kernel page to user page if it's in use */
+			if (map->orig_phys != INVALID_PHYS_ADDR)
+				memcpy_to_page(pages[i], 0,
+					       page_address(map->bounce_page),
+					       PAGE_SIZE);
+			__free_page(map->bounce_page);
+		}
+		map->bounce_page = pages[i];
+		get_page(pages[i]);
+	}
+	domain->user_bounce_pages = true;
+	ret = 0;
+out:
+	write_unlock(&domain->bounce_lock);
+
+	return ret;
+}
+
+void vduse_domain_remove_user_bounce_pages(struct vduse_iova_domain *domain)
+{
+	struct vduse_bounce_map *map;
+	unsigned long i, count;
+
+	write_lock(&domain->bounce_lock);
+	if (!domain->user_bounce_pages)
+		goto out;
+
+	count = domain->bounce_size >> PAGE_SHIFT;
+	for (i = 0; i < count; i++) {
+		struct page *page = NULL;
+
+		map = &domain->bounce_maps[i];
+		if (WARN_ON(!map->bounce_page))
+			continue;
+
+		/* Copy user page to kernel page if it's in use */
+		if (map->orig_phys != INVALID_PHYS_ADDR) {
+			page = alloc_page(GFP_ATOMIC | __GFP_NOFAIL);
+			memcpy_from_page(page_address(page),
+					 map->bounce_page, 0, PAGE_SIZE);
+		}
+		put_page(map->bounce_page);
+		map->bounce_page = page;
+	}
+	domain->user_bounce_pages = false;
+out:
+	write_unlock(&domain->bounce_lock);
+}
+
 void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain)
 {
 	if (!domain->bounce_map)
@@ -322,13 +389,18 @@ dma_addr_t vduse_domain_map_page(struct vduse_iova_domain *domain,
 	if (vduse_domain_init_bounce_map(domain))
 		goto err;
 
+	read_lock(&domain->bounce_lock);
 	if (vduse_domain_map_bounce_page(domain, (u64)iova, (u64)size, pa))
-		goto err;
+		goto err_unlock;
 
 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
 		vduse_domain_bounce(domain, iova, size, DMA_TO_DEVICE);
 
+	read_unlock(&domain->bounce_lock);
+
 	return iova;
+err_unlock:
+	read_unlock(&domain->bounce_lock);
 err:
 	vduse_domain_free_iova(iovad, iova, size);
 	return DMA_MAPPING_ERROR;
@@ -340,10 +412,12 @@ void vduse_domain_unmap_page(struct vduse_iova_domain *domain,
 {
 	struct iova_domain *iovad = &domain->stream_iovad;
 
+	read_lock(&domain->bounce_lock);
 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
 		vduse_domain_bounce(domain, dma_addr, size, DMA_FROM_DEVICE);
 
 	vduse_domain_unmap_bounce_page(domain, (u64)dma_addr, (u64)size);
+	read_unlock(&domain->bounce_lock);
 	vduse_domain_free_iova(iovad, dma_addr, size);
 }
 
@@ -451,7 +525,8 @@ static int vduse_domain_release(struct inode *inode, struct file *file)
vduse_domain_release(struct inode *inode, struct file *file) spin_lock(&domain->iotlb_lock); vduse_iotlb_del_range(domain, 0, ULLONG_MAX); - vduse_domain_free_bounce_pages(domain); + vduse_domain_remove_user_bounce_pages(domain); + vduse_domain_free_kernel_bounce_pages(domain); spin_unlock(&domain->iotlb_lock); put_iova_domain(&domain->stream_iovad); put_iova_domain(&domain->consistent_iovad); @@ -511,6 +586,7 @@ vduse_domain_create(unsigned long iova_limit, size_t bounce_size) goto err_file; domain->file = file; + rwlock_init(&domain->bounce_lock); spin_lock_init(&domain->iotlb_lock); init_iova_domain(&domain->stream_iovad, PAGE_SIZE, IOVA_START_PFN); diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h index 2722d9b8e21a..4e0e50e7ac15 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.h +++ b/drivers/vdpa/vdpa_user/iova_domain.h @@ -14,6 +14,7 @@ #include <linux/iova.h> #include <linux/dma-mapping.h> #include <linux/vhost_iotlb.h> +#include <linux/rwlock.h> #define IOVA_START_PFN 1 @@ -34,6 +35,8 @@ struct vduse_iova_domain { struct vhost_iotlb *iotlb; spinlock_t iotlb_lock; struct file *file; + bool user_bounce_pages; + rwlock_t bounce_lock; }; int vduse_domain_set_map(struct vduse_iova_domain *domain, @@ -61,6 +64,11 @@ void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size, void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain); +int vduse_domain_add_user_bounce_pages(struct vduse_iova_domain *domain, + struct page **pages, int count); + +void vduse_domain_remove_user_bounce_pages(struct vduse_iova_domain *domain); + void vduse_domain_destroy(struct vduse_iova_domain *domain); struct vduse_iova_domain *vduse_domain_create(unsigned long iova_limit, diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 3bc27de58f46..41c0b29739f1 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -21,6 +21,8 @@ #include <linux/uio.h> #include <linux/vdpa.h> #include <linux/nospec.h> +#include <linux/vmalloc.h> +#include <linux/sched/mm.h> #include <uapi/linux/vduse.h> #include <uapi/linux/vdpa.h> #include <uapi/linux/virtio_config.h> @@ -64,6 +66,13 @@ struct vduse_vdpa { struct vduse_dev *dev; }; +struct vduse_umem { + unsigned long iova; + unsigned long npages; + struct page **pages; + struct mm_struct *mm; +}; + struct vduse_dev { struct vduse_vdpa *vdev; struct device *dev; @@ -95,6 +104,8 @@ struct vduse_dev { u8 status; u32 vq_num; u32 vq_align; + struct vduse_umem *umem; + struct mutex mem_lock; }; struct vduse_dev_msg { @@ -917,6 +928,102 @@ unlock: return ret; } +static int vduse_dev_dereg_umem(struct vduse_dev *dev, + u64 iova, u64 size) +{ + int ret; + + mutex_lock(&dev->mem_lock); + ret = -ENOENT; + if (!dev->umem) + goto unlock; + + ret = -EINVAL; + if (dev->umem->iova != iova || size != dev->domain->bounce_size) + goto unlock; + + vduse_domain_remove_user_bounce_pages(dev->domain); + unpin_user_pages_dirty_lock(dev->umem->pages, + dev->umem->npages, true); + atomic64_sub(dev->umem->npages, &dev->umem->mm->pinned_vm); + mmdrop(dev->umem->mm); + vfree(dev->umem->pages); + kfree(dev->umem); + dev->umem = NULL; + ret = 0; +unlock: + mutex_unlock(&dev->mem_lock); + return ret; +} + +static int vduse_dev_reg_umem(struct vduse_dev *dev, + u64 iova, u64 uaddr, u64 size) +{ + struct page **page_list = NULL; + struct vduse_umem *umem = NULL; + long pinned = 0; + unsigned long npages, lock_limit; + int ret; + + if (!dev->domain->bounce_map || + size != 
dev->domain->bounce_size || + iova != 0 || uaddr & ~PAGE_MASK) + return -EINVAL; + + mutex_lock(&dev->mem_lock); + ret = -EEXIST; + if (dev->umem) + goto unlock; + + ret = -ENOMEM; + npages = size >> PAGE_SHIFT; + page_list = __vmalloc(array_size(npages, sizeof(struct page *)), + GFP_KERNEL_ACCOUNT); + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!page_list || !umem) + goto unlock; + + mmap_read_lock(current->mm); + + lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK)); + if (npages + atomic64_read(&current->mm->pinned_vm) > lock_limit) + goto out; + + pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE, + page_list, NULL); + if (pinned != npages) { + ret = pinned < 0 ? pinned : -ENOMEM; + goto out; + } + + ret = vduse_domain_add_user_bounce_pages(dev->domain, + page_list, pinned); + if (ret) + goto out; + + atomic64_add(npages, &current->mm->pinned_vm); + + umem->pages = page_list; + umem->npages = pinned; + umem->iova = iova; + umem->mm = current->mm; + mmgrab(current->mm); + + dev->umem = umem; +out: + if (ret && pinned > 0) + unpin_user_pages(page_list, pinned); + + mmap_read_unlock(current->mm); +unlock: + if (ret) { + vfree(page_list); + kfree(umem); + } + mutex_unlock(&dev->mem_lock); + return ret; +} + static long vduse_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -1089,6 +1196,77 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index].inject); break; } + case VDUSE_IOTLB_REG_UMEM: { + struct vduse_iova_umem umem; + + ret = -EFAULT; + if (copy_from_user(&umem, argp, sizeof(umem))) + break; + + ret = -EINVAL; + if (!is_mem_zero((const char *)umem.reserved, + sizeof(umem.reserved))) + break; + + ret = vduse_dev_reg_umem(dev, umem.iova, + umem.uaddr, umem.size); + break; + } + case VDUSE_IOTLB_DEREG_UMEM: { + struct vduse_iova_umem umem; + + ret = -EFAULT; + if (copy_from_user(&umem, argp, sizeof(umem))) + break; + + ret = -EINVAL; + if (!is_mem_zero((const char *)umem.reserved, + sizeof(umem.reserved))) + break; + + ret = vduse_dev_dereg_umem(dev, umem.iova, + umem.size); + break; + } + case VDUSE_IOTLB_GET_INFO: { + struct vduse_iova_info info; + struct vhost_iotlb_map *map; + struct vduse_iova_domain *domain = dev->domain; + + ret = -EFAULT; + if (copy_from_user(&info, argp, sizeof(info))) + break; + + ret = -EINVAL; + if (info.start > info.last) + break; + + if (!is_mem_zero((const char *)info.reserved, + sizeof(info.reserved))) + break; + + spin_lock(&domain->iotlb_lock); + map = vhost_iotlb_itree_first(domain->iotlb, + info.start, info.last); + if (map) { + info.start = map->start; + info.last = map->last; + info.capability = 0; + if (domain->bounce_map && map->start == 0 && + map->last == domain->bounce_size - 1) + info.capability |= VDUSE_IOVA_CAP_UMEM; + } + spin_unlock(&domain->iotlb_lock); + if (!map) + break; + + ret = -EFAULT; + if (copy_to_user(argp, &info, sizeof(info))) + break; + + ret = 0; + break; + } default: ret = -ENOIOCTLCMD; break; @@ -1101,6 +1279,7 @@ static int vduse_dev_release(struct inode *inode, struct file *file) { struct vduse_dev *dev = file->private_data; + vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size); spin_lock(&dev->msg_lock); /* Make sure the inflight messages can be processed after reconnection */ list_splice_init(&dev->recv_list, &dev->send_list); @@ -1163,6 +1342,7 @@ static struct vduse_dev *vduse_dev_create(void) return NULL; mutex_init(&dev->lock); + mutex_init(&dev->mem_lock); spin_lock_init(&dev->msg_lock); INIT_LIST_HEAD(&dev->send_list);
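The checks in vduse_dev_reg_umem() and the VDUSE_IOTLB_* ioctl cases above define the whole userspace contract: iova must be 0, size must equal the domain's bounce size, uaddr must be page-aligned, and the reserved fields must be zeroed. A minimal userspace sketch of registering a umem, assuming only what those handlers check (struct layouts come from <linux/vduse.h>; vduse_reg_bounce_umem() and its error handling are illustrative, not part of the patch):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vduse.h>

static int vduse_reg_bounce_umem(int dev_fd)
{
	struct vduse_iova_info info;
	struct vduse_iova_umem umem;
	uint64_t size;
	void *buf;

	/* Find the bounce region and make sure it can back a umem. */
	memset(&info, 0, sizeof(info));
	info.start = 0;
	info.last = UINT64_MAX;
	if (ioctl(dev_fd, VDUSE_IOTLB_GET_INFO, &info))
		return -1;
	if (!(info.capability & VDUSE_IOVA_CAP_UMEM))
		return -1;

	/* vduse_dev_reg_umem() insists on iova == 0 and the full bounce size. */
	size = info.last - info.start + 1;
	if (posix_memalign(&buf, sysconf(_SC_PAGESIZE), size))
		return -1;

	memset(&umem, 0, sizeof(umem));		/* keeps umem.reserved zeroed */
	umem.uaddr = (uintptr_t)buf;
	umem.iova = info.start;
	umem.size = size;
	return ioctl(dev_fd, VDUSE_IOTLB_REG_UMEM, &umem);
}

Deregistration mirrors this with VDUSE_IOTLB_DEREG_UMEM and the same iova/size pair, and vduse_dev_release() above tears the umem down automatically if userspace exits without doing so.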
INIT_LIST_HEAD(&dev->recv_list); diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 9b65509424dc..7ebf106d50c1 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -159,9 +159,13 @@ enum { }; #define VHOST_SCSI_MAX_TARGET 256 -#define VHOST_SCSI_MAX_VQ 128 +#define VHOST_SCSI_MAX_IO_VQ 1024 #define VHOST_SCSI_MAX_EVENT 128 +static unsigned vhost_scsi_max_io_vqs = 128; +module_param_named(max_io_vqs, vhost_scsi_max_io_vqs, uint, 0644); +MODULE_PARM_DESC(max_io_vqs, "Set the max number of IO virtqueues a vhost scsi device can support. The default is 128. The max is 1024."); + struct vhost_scsi_virtqueue { struct vhost_virtqueue vq; /* @@ -186,7 +190,9 @@ struct vhost_scsi { char vs_vhost_wwpn[TRANSPORT_IQN_LEN]; struct vhost_dev dev; - struct vhost_scsi_virtqueue vqs[VHOST_SCSI_MAX_VQ]; + struct vhost_scsi_virtqueue *vqs; + unsigned long *compl_bitmap; + struct vhost_scsi_inflight **old_inflight; struct vhost_work vs_completion_work; /* cmd completion work item */ struct llist_head vs_completion_list; /* cmd completion queue */ @@ -245,7 +251,7 @@ static void vhost_scsi_init_inflight(struct vhost_scsi *vs, struct vhost_virtqueue *vq; int idx, i; - for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) { + for (i = 0; i < vs->dev.nvqs; i++) { vq = &vs->vqs[i].vq; mutex_lock(&vq->mutex); @@ -533,7 +539,6 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work) { struct vhost_scsi *vs = container_of(work, struct vhost_scsi, vs_completion_work); - DECLARE_BITMAP(signal, VHOST_SCSI_MAX_VQ); struct virtio_scsi_cmd_resp v_rsp; struct vhost_scsi_cmd *cmd, *t; struct llist_node *llnode; @@ -541,7 +546,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work) struct iov_iter iov_iter; int ret, vq; - bitmap_zero(signal, VHOST_SCSI_MAX_VQ); + bitmap_zero(vs->compl_bitmap, vs->dev.nvqs); llnode = llist_del_all(&vs->vs_completion_list); llist_for_each_entry_safe(cmd, t, llnode, tvc_completion_list) { se_cmd = &cmd->tvc_se_cmd; @@ -566,7 +571,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work) vhost_add_used(cmd->tvc_vq, cmd->tvc_vq_desc, 0); q = container_of(cmd->tvc_vq, struct vhost_scsi_virtqueue, vq); vq = q - vs->vqs; - __set_bit(vq, signal); + __set_bit(vq, vs->compl_bitmap); } else pr_err("Faulted on virtio_scsi_cmd_resp\n"); @@ -574,8 +579,8 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work) } vq = -1; - while ((vq = find_next_bit(signal, VHOST_SCSI_MAX_VQ, vq + 1)) - < VHOST_SCSI_MAX_VQ) + while ((vq = find_next_bit(vs->compl_bitmap, vs->dev.nvqs, vq + 1)) + < vs->dev.nvqs) vhost_signal(&vs->dev, &vs->vqs[vq].vq); } @@ -1419,26 +1424,25 @@ static void vhost_scsi_handle_kick(struct vhost_work *work) /* Callers must hold dev mutex */ static void vhost_scsi_flush(struct vhost_scsi *vs) { - struct vhost_scsi_inflight *old_inflight[VHOST_SCSI_MAX_VQ]; int i; /* Init new inflight and remember the old inflight */ - vhost_scsi_init_inflight(vs, old_inflight); + vhost_scsi_init_inflight(vs, vs->old_inflight); /* * The inflight->kref was initialized to 1. We decrement it here to * indicate the start of the flush operation so that it will reach 0 * when all the reqs are finished. 
*/ - for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) - kref_put(&old_inflight[i]->kref, vhost_scsi_done_inflight); + for (i = 0; i < vs->dev.nvqs; i++) + kref_put(&vs->old_inflight[i]->kref, vhost_scsi_done_inflight); /* Flush both the vhost poll and vhost work */ vhost_dev_flush(&vs->dev); /* Wait for all reqs issued before the flush to be finished */ - for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) - wait_for_completion(&old_inflight[i]->comp); + for (i = 0; i < vs->dev.nvqs; i++) + wait_for_completion(&vs->old_inflight[i]->comp); } static void vhost_scsi_destroy_vq_cmds(struct vhost_virtqueue *vq) @@ -1601,7 +1605,7 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs, memcpy(vs->vs_vhost_wwpn, t->vhost_wwpn, sizeof(vs->vs_vhost_wwpn)); - for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++) { + for (i = VHOST_SCSI_VQ_IO; i < vs->dev.nvqs; i++) { vq = &vs->vqs[i].vq; if (!vhost_vq_is_setup(vq)) continue; @@ -1611,7 +1615,7 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs, goto destroy_vq_cmds; } - for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) { + for (i = 0; i < vs->dev.nvqs; i++) { vq = &vs->vqs[i].vq; mutex_lock(&vq->mutex); vhost_vq_set_backend(vq, vs_tpg); @@ -1713,7 +1717,7 @@ vhost_scsi_clear_endpoint(struct vhost_scsi *vs, target_undepend_item(&se_tpg->tpg_group.cg_item); } if (match) { - for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) { + for (i = 0; i < vs->dev.nvqs; i++) { vq = &vs->vqs[i].vq; mutex_lock(&vq->mutex); vhost_vq_set_backend(vq, NULL); @@ -1722,7 +1726,7 @@ vhost_scsi_clear_endpoint(struct vhost_scsi *vs, /* Make sure cmds are not running before tearing them down. */ vhost_scsi_flush(vs); - for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) { + for (i = 0; i < vs->dev.nvqs; i++) { vq = &vs->vqs[i].vq; vhost_scsi_destroy_vq_cmds(vq); } @@ -1762,7 +1766,7 @@ static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features) return -EFAULT; } - for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) { + for (i = 0; i < vs->dev.nvqs; i++) { vq = &vs->vqs[i].vq; mutex_lock(&vq->mutex); vq->acked_features = features; @@ -1776,16 +1780,40 @@ static int vhost_scsi_open(struct inode *inode, struct file *f) { struct vhost_scsi *vs; struct vhost_virtqueue **vqs; - int r = -ENOMEM, i; + int r = -ENOMEM, i, nvqs = vhost_scsi_max_io_vqs; vs = kvzalloc(sizeof(*vs), GFP_KERNEL); if (!vs) goto err_vs; - vqs = kmalloc_array(VHOST_SCSI_MAX_VQ, sizeof(*vqs), GFP_KERNEL); - if (!vqs) + if (nvqs > VHOST_SCSI_MAX_IO_VQ) { + pr_err("Invalid max_io_vqs of %d. Using %d.\n", nvqs, + VHOST_SCSI_MAX_IO_VQ); + nvqs = VHOST_SCSI_MAX_IO_VQ; + } else if (nvqs == 0) { + pr_err("Invalid max_io_vqs of %d. 
Using 1.\n", nvqs); + nvqs = 1; + } + nvqs += VHOST_SCSI_VQ_IO; + + vs->compl_bitmap = bitmap_alloc(nvqs, GFP_KERNEL); + if (!vs->compl_bitmap) + goto err_compl_bitmap; + + vs->old_inflight = kmalloc_array(nvqs, sizeof(*vs->old_inflight), + GFP_KERNEL | __GFP_ZERO); + if (!vs->old_inflight) + goto err_inflight; + + vs->vqs = kmalloc_array(nvqs, sizeof(*vs->vqs), + GFP_KERNEL | __GFP_ZERO); + if (!vs->vqs) goto err_vqs; + vqs = kmalloc_array(nvqs, sizeof(*vqs), GFP_KERNEL); + if (!vqs) + goto err_local_vqs; + vhost_work_init(&vs->vs_completion_work, vhost_scsi_complete_cmd_work); vhost_work_init(&vs->vs_event_work, vhost_scsi_evt_work); @@ -1796,11 +1824,11 @@ static int vhost_scsi_open(struct inode *inode, struct file *f) vqs[VHOST_SCSI_VQ_EVT] = &vs->vqs[VHOST_SCSI_VQ_EVT].vq; vs->vqs[VHOST_SCSI_VQ_CTL].vq.handle_kick = vhost_scsi_ctl_handle_kick; vs->vqs[VHOST_SCSI_VQ_EVT].vq.handle_kick = vhost_scsi_evt_handle_kick; - for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++) { + for (i = VHOST_SCSI_VQ_IO; i < nvqs; i++) { vqs[i] = &vs->vqs[i].vq; vs->vqs[i].vq.handle_kick = vhost_scsi_handle_kick; } - vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ, UIO_MAXIOV, + vhost_dev_init(&vs->dev, vqs, nvqs, UIO_MAXIOV, VHOST_SCSI_WEIGHT, 0, true, NULL); vhost_scsi_init_inflight(vs, NULL); @@ -1808,7 +1836,13 @@ static int vhost_scsi_open(struct inode *inode, struct file *f) f->private_data = vs; return 0; +err_local_vqs: + kfree(vs->vqs); err_vqs: + kfree(vs->old_inflight); +err_inflight: + bitmap_free(vs->compl_bitmap); +err_compl_bitmap: kvfree(vs); err_vs: return r; @@ -1826,6 +1860,9 @@ static int vhost_scsi_release(struct inode *inode, struct file *f) vhost_dev_stop(&vs->dev); vhost_dev_cleanup(&vs->dev); kfree(vs->dev.vqs); + kfree(vs->vqs); + kfree(vs->old_inflight); + bitmap_free(vs->compl_bitmap); kvfree(vs); return 0; } diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 23dcbfdfa13b..166044642fd5 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -347,6 +347,14 @@ static long vhost_vdpa_set_config(struct vhost_vdpa *v, return 0; } +static bool vhost_vdpa_can_suspend(const struct vhost_vdpa *v) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + + return ops->suspend; +} + static long vhost_vdpa_get_features(struct vhost_vdpa *v, u64 __user *featurep) { struct vdpa_device *vdpa = v->vdpa; @@ -470,6 +478,22 @@ static long vhost_vdpa_get_vqs_count(struct vhost_vdpa *v, u32 __user *argp) return 0; }
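How the new suspend pieces fit together from userspace, as a minimal hedged sketch assuming only what the handlers in this hunk show (the feature bit must be acked before the new ioctl is usable, and VHOST_VDPA_SUSPEND takes no argument; error handling is elided):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

static int vdpa_try_suspend(int vdpa_fd)
{
	uint64_t features;

	if (ioctl(vdpa_fd, VHOST_GET_BACKEND_FEATURES, &features))
		return -1;
	if (!(features & (1ULL << VHOST_BACKEND_F_SUSPEND)))
		return -1;	/* parent device lacks a ->suspend op */

	/* Ack the advertised features, suspend bit included. */
	if (ioctl(vdpa_fd, VHOST_SET_BACKEND_FEATURES, &features))
		return -1;

	/* No argument; on success the device stops processing descriptors. */
	return ioctl(vdpa_fd, VHOST_VDPA_SUSPEND);
}

+/* After a successful return of this ioctl the device must not process more + * virtqueue descriptors. The device can answer reads or writes of config + * fields as if it were not suspended. In particular, writing to "queue_enable" + * with a value of 1 will not make the device start processing buffers.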
+ */ +static long vhost_vdpa_suspend(struct vhost_vdpa *v) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + + if (!ops->suspend) + return -EOPNOTSUPP; + + return ops->suspend(vdpa); +} + static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, void __user *argp) { @@ -577,7 +601,11 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, if (cmd == VHOST_SET_BACKEND_FEATURES) { if (copy_from_user(&features, featurep, sizeof(features))) return -EFAULT; - if (features & ~VHOST_VDPA_BACKEND_FEATURES) + if (features & ~(VHOST_VDPA_BACKEND_FEATURES | + BIT_ULL(VHOST_BACKEND_F_SUSPEND))) + return -EOPNOTSUPP; + if ((features & BIT_ULL(VHOST_BACKEND_F_SUSPEND)) && + !vhost_vdpa_can_suspend(v)) return -EOPNOTSUPP; vhost_set_backend_features(&v->vdev, features); return 0; @@ -628,6 +656,8 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, break; case VHOST_GET_BACKEND_FEATURES: features = VHOST_VDPA_BACKEND_FEATURES; + if (vhost_vdpa_can_suspend(v)) + features |= BIT_ULL(VHOST_BACKEND_F_SUSPEND); if (copy_to_user(featurep, &features, sizeof(features))) r = -EFAULT; break; @@ -640,6 +670,9 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, case VHOST_VDPA_GET_VQS_COUNT: r = vhost_vdpa_get_vqs_count(v, argp); break; + case VHOST_VDPA_SUSPEND: + r = vhost_vdpa_suspend(v); + break; default: r = vhost_dev_ioctl(&v->vdev, cmd, argp); if (r == -ENOIOCTLCMD) @@ -1076,7 +1109,7 @@ static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v) if (!bus) return -EFAULT; - if (!iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY)) + if (!device_iommu_capable(dma_dev, IOMMU_CAP_CACHE_COHERENCY)) return -ENOTSUPP; v->domain = iommu_domain_alloc(bus); @@ -1363,6 +1396,7 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa) err: put_device(&v->dev); + ida_simple_remove(&vhost_vdpa_ida, v->minor); return r; } diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index eab55accf381..11f59dd06a74 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -1095,7 +1095,8 @@ EXPORT_SYMBOL(vringh_need_notify_kern); #if IS_REACHABLE(CONFIG_VHOST_IOTLB) static int iotlb_translate(const struct vringh *vrh, - u64 addr, u64 len, struct bio_vec iov[], + u64 addr, u64 len, u64 *translated, + struct bio_vec iov[], int iov_size, u32 perm) { struct vhost_iotlb_map *map; @@ -1136,43 +1137,76 @@ static int iotlb_translate(const struct vringh *vrh, spin_unlock(vrh->iotlb_lock); + if (translated) + *translated = min(len, s); + return ret; } static inline int copy_from_iotlb(const struct vringh *vrh, void *dst, void *src, size_t len) { - struct iov_iter iter; - struct bio_vec iov[16]; - int ret; + u64 total_translated = 0; - ret = iotlb_translate(vrh, (u64)(uintptr_t)src, - len, iov, 16, VHOST_MAP_RO); - if (ret < 0) - return ret; + while (total_translated < len) { + struct bio_vec iov[16]; + struct iov_iter iter; + u64 translated; + int ret; - iov_iter_bvec(&iter, READ, iov, ret, len); + ret = iotlb_translate(vrh, (u64)(uintptr_t)src, + len - total_translated, &translated, + iov, ARRAY_SIZE(iov), VHOST_MAP_RO); + if (ret == -ENOBUFS) + ret = ARRAY_SIZE(iov); + else if (ret < 0) + return ret; - ret = copy_from_iter(dst, len, &iter); + iov_iter_bvec(&iter, READ, iov, ret, translated); - return ret; + ret = copy_from_iter(dst, translated, &iter); + if (ret < 0) + return ret; + + src += translated; + dst += translated; + total_translated += translated; + } + + return total_translated; } static inline int copy_to_iotlb(const struct vringh *vrh, 
void *dst, void *src, size_t len) { - struct iov_iter iter; - struct bio_vec iov[16]; - int ret; + u64 total_translated = 0; - ret = iotlb_translate(vrh, (u64)(uintptr_t)dst, - len, iov, 16, VHOST_MAP_WO); - if (ret < 0) - return ret; + while (total_translated < len) { + struct bio_vec iov[16]; + struct iov_iter iter; + u64 translated; + int ret; + + ret = iotlb_translate(vrh, (u64)(uintptr_t)dst, + len - total_translated, &translated, + iov, ARRAY_SIZE(iov), VHOST_MAP_WO); + if (ret == -ENOBUFS) + ret = ARRAY_SIZE(iov); + else if (ret < 0) + return ret; - iov_iter_bvec(&iter, WRITE, iov, ret, len); + iov_iter_bvec(&iter, WRITE, iov, ret, translated); + + ret = copy_to_iter(src, translated, &iter); + if (ret < 0) + return ret; + + src += translated; + dst += translated; + total_translated += translated; + } - return copy_to_iter(src, len, &iter); + return total_translated; } static inline int getu16_iotlb(const struct vringh *vrh, @@ -1183,7 +1217,7 @@ static inline int getu16_iotlb(const struct vringh *vrh, int ret; /* Atomic read is needed for getu16 */ - ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), + ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), NULL, &iov, 1, VHOST_MAP_RO); if (ret < 0) return ret; @@ -1204,7 +1238,7 @@ static inline int putu16_iotlb(const struct vringh *vrh, int ret; /* Atomic write is needed for putu16 */ - ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), + ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), NULL, &iov, 1, VHOST_MAP_WO); if (ret < 0) return ret; diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index 56c77f63cd22..0a53a61231c2 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -35,11 +35,12 @@ if VIRTIO_MENU config VIRTIO_HARDEN_NOTIFICATION bool "Harden virtio notification" + depends on BROKEN help Enable this to harden the device notifications and suppress those that happen at a time where notifications are illegal. - Experimental: Note that several drivers still have bugs that + Experimental: Note that several drivers still have issues that may cause crashes or hangs when correct handling of notifications is enforced; depending on the subset of drivers and devices you use, this may or may not work. @@ -126,9 +127,11 @@ config VIRTIO_MEM This driver provides access to virtio-mem paravirtualized memory devices, allowing to hotplug and hotunplug memory. - This driver was only tested under x86-64 and arm64, but should - theoretically work on all architectures that support memory hotplug - and hotremove. + This driver currently only supports x86-64 and arm64. Although it + should compile on other architectures that implement memory + hot(un)plug, architecture-specific and/or common + code changes may be required for virtio-mem, kdump and kexec to work as + expected. If unsure, say M. 
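One note on the vringh rework above: copy_from_iotlb()/copy_to_iotlb() used to do a single iotlb_translate() into a 16-entry bio_vec array and then copy the full length in one shot, which quietly assumed one pass could describe the whole transfer. The new loop treats -ENOBUFS as "the iov array is full": it copies however many bytes were actually translated, advances, and repeats until the whole length is consumed. The same pattern in a self-contained sketch; chunk_translate() is an illustrative stand-in for iotlb_translate(), not a kernel API:

#include <errno.h>
#include <stdio.h>
#include <string.h>

#define CHUNK_CAP 16	/* stands in for the 16-entry bio_vec array */

/* Pretends each pass can only describe CHUNK_CAP bytes and reports the
 * covered length via *translated, like the reworked iotlb_translate(). */
static int chunk_translate(size_t len, size_t *translated)
{
	*translated = len < CHUNK_CAP ? len : CHUNK_CAP;
	return len > CHUNK_CAP ? -ENOBUFS : 0;
}

static long copy_chunked(char *dst, const char *src, size_t len)
{
	size_t total = 0;

	while (total < len) {
		size_t translated;
		int ret = chunk_translate(len - total, &translated);

		if (ret == -ENOBUFS)
			ret = 0;	/* partial translation: copy, then loop */
		else if (ret < 0)
			return ret;	/* hard translation failure */

		memcpy(dst + total, src + total, translated);
		total += translated;
	}
	return (long)total;
}

int main(void)
{
	char src[40] = "the quick brown fox jumps over the dog";
	char dst[40] = { 0 };

	if (copy_chunked(dst, src, sizeof(src)) == sizeof(src))
		printf("%s\n", dst);
	return 0;
}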
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 14c142d77fba..828ced060742 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -428,7 +428,9 @@ int register_virtio_device(struct virtio_device *dev) goto out; dev->index = err; - dev_set_name(&dev->dev, "virtio%u", dev->index); + err = dev_set_name(&dev->dev, "virtio%u", dev->index); + if (err) + goto out_ida_remove; err = virtio_device_of_init(dev); if (err) diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index 083ff1eb743d..c492a57531c6 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -360,7 +360,7 @@ static void vm_synchronize_cbs(struct virtio_device *vdev) static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int index, void (*callback)(struct virtqueue *vq), - const char *name, bool ctx) + const char *name, u32 size, bool ctx) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); struct virtio_mmio_vq_info *info; @@ -395,14 +395,19 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned int in goto error_new_virtqueue; } + if (!size || size > num) + size = num; + /* Create the vring */ - vq = vring_create_virtqueue(index, num, VIRTIO_MMIO_VRING_ALIGN, vdev, + vq = vring_create_virtqueue(index, size, VIRTIO_MMIO_VRING_ALIGN, vdev, true, true, ctx, vm_notify, callback, name); if (!vq) { err = -ENOMEM; goto error_new_virtqueue; } + vq->num_max = num; + /* Activate the queue */ writel(virtqueue_get_vring_size(vq), vm_dev->base + VIRTIO_MMIO_QUEUE_NUM); if (vm_dev->version == 1) { @@ -472,6 +477,7 @@ static int vm_find_vqs(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], const char * const names[], + u32 sizes[], const bool *ctx, struct irq_affinity *desc) { @@ -487,6 +493,9 @@ static int vm_find_vqs(struct virtio_device *vdev, unsigned int nvqs, if (err) return err; + if (of_property_read_bool(vm_dev->pdev->dev.of_node, "wakeup-source")) + enable_irq_wake(irq); + for (i = 0; i < nvqs; ++i) { if (!names[i]) { vqs[i] = NULL; @@ -494,6 +503,7 @@ static int vm_find_vqs(struct virtio_device *vdev, unsigned int nvqs, } vqs[i] = vm_setup_vq(vdev, queue_idx++, callbacks[i], names[i], + sizes ? sizes[i] : 0, ctx ? ctx[i] : false); if (IS_ERR(vqs[i])) { vm_del_vqs(vdev); diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index ca51fcc9daab..00ad476a815d 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -174,6 +174,7 @@ error: static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned int index, void (*callback)(struct virtqueue *vq), const char *name, + u32 size, bool ctx, u16 msix_vec) { @@ -186,7 +187,7 @@ static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned int in if (!info) return ERR_PTR(-ENOMEM); - vq = vp_dev->setup_vq(vp_dev, info, index, callback, name, ctx, + vq = vp_dev->setup_vq(vp_dev, info, index, callback, name, size, ctx, msix_vec); if (IS_ERR(vq)) goto out_info; @@ -214,9 +215,15 @@ static void vp_del_vq(struct virtqueue *vq) struct virtio_pci_vq_info *info = vp_dev->vqs[vq->index]; unsigned long flags; - spin_lock_irqsave(&vp_dev->lock, flags); - list_del(&info->node); - spin_unlock_irqrestore(&vp_dev->lock, flags); + /* + * If the vq is in its reset state (e.g. because re-enabling it after a + * reset failed), info->node has already been removed from the queue + * list, so don't unlink it again here. This prevents unexpected irqs.
+ */ + if (!vq->reset) { + spin_lock_irqsave(&vp_dev->lock, flags); + list_del(&info->node); + spin_unlock_irqrestore(&vp_dev->lock, flags); + } vp_dev->del_vq(info); kfree(info); @@ -277,7 +284,7 @@ void vp_del_vqs(struct virtio_device *vdev) static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], bool per_vq_vectors, + const char * const names[], u32 sizes[], bool per_vq_vectors, const bool *ctx, struct irq_affinity *desc) { @@ -320,8 +327,8 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, else msix_vec = VP_MSIX_VQ_VECTOR; vqs[i] = vp_setup_vq(vdev, queue_idx++, callbacks[i], names[i], - ctx ? ctx[i] : false, - msix_vec); + sizes ? sizes[i] : 0, + ctx ? ctx[i] : false, msix_vec); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto error_find; @@ -351,7 +358,7 @@ error_find: static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], const bool *ctx) + const char * const names[], u32 sizes[], const bool *ctx) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); int i, err, queue_idx = 0; @@ -373,6 +380,7 @@ static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned int nvqs, continue; } vqs[i] = vp_setup_vq(vdev, queue_idx++, callbacks[i], names[i], + sizes ? sizes[i] : 0, ctx ? ctx[i] : false, VIRTIO_MSI_NO_VECTOR); if (IS_ERR(vqs[i])) { @@ -390,21 +398,21 @@ out_del_vqs: /* the config->find_vqs() implementation */ int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], const bool *ctx, + const char * const names[], u32 sizes[], const bool *ctx, struct irq_affinity *desc) { int err; /* Try MSI-X with one vector per queue. */ - err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, true, ctx, desc); + err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, sizes, true, ctx, desc); if (!err) return 0; /* Fallback: MSI-X with one vector for config, one shared for queues. */ - err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, false, ctx, desc); + err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, sizes, false, ctx, desc); if (!err) return 0; /* Finally fall back to regular interrupts. 
*/ - return vp_find_vqs_intx(vdev, nvqs, vqs, callbacks, names, ctx); + return vp_find_vqs_intx(vdev, nvqs, vqs, callbacks, names, sizes, ctx); } const char *vp_bus_name(struct virtio_device *vdev) diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index 23112d84218f..c0448378b698 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -80,6 +80,7 @@ struct virtio_pci_device { unsigned int idx, void (*callback)(struct virtqueue *vq), const char *name, + u32 size, bool ctx, u16 msix_vec); void (*del_vq)(struct virtio_pci_vq_info *info); @@ -110,7 +111,7 @@ void vp_del_vqs(struct virtio_device *vdev); /* the config->find_vqs() implementation */ int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], const bool *ctx, + const char * const names[], u32 sizes[], const bool *ctx, struct irq_affinity *desc); const char *vp_bus_name(struct virtio_device *vdev); diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c index a5e5721145c7..d75e5c4e637f 100644 --- a/drivers/virtio/virtio_pci_legacy.c +++ b/drivers/virtio/virtio_pci_legacy.c @@ -112,6 +112,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, unsigned int index, void (*callback)(struct virtqueue *vq), const char *name, + u32 size, bool ctx, u16 msix_vec) { @@ -125,16 +126,21 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, if (!num || vp_legacy_get_queue_enable(&vp_dev->ldev, index)) return ERR_PTR(-ENOENT); + if (!size || size > num) + size = num; + info->msix_vector = msix_vec; /* create the vring */ - vq = vring_create_virtqueue(index, num, + vq = vring_create_virtqueue(index, size, VIRTIO_PCI_VRING_ALIGN, &vp_dev->vdev, true, false, ctx, vp_notify, callback, name); if (!vq) return ERR_PTR(-ENOMEM); + vq->num_max = num; + q_pfn = virtqueue_get_desc_addr(vq) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT; if (q_pfn >> 32) { dev_err(&vp_dev->pci_dev->dev, diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 623906b4996c..f7965c5dd36b 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -34,6 +34,9 @@ static void vp_transport_features(struct virtio_device *vdev, u64 features) if ((features & BIT_ULL(VIRTIO_F_SR_IOV)) && pci_find_ext_capability(pci_dev, PCI_EXT_CAP_ID_SRIOV)) __virtio_set_bit(vdev, VIRTIO_F_SR_IOV); + + if (features & BIT_ULL(VIRTIO_F_RING_RESET)) + __virtio_set_bit(vdev, VIRTIO_F_RING_RESET); } /* virtio config->finalize_features() implementation */ @@ -176,6 +179,110 @@ static void vp_reset(struct virtio_device *vdev) vp_synchronize_vectors(vdev); } +static int vp_active_vq(struct virtqueue *vq, u16 msix_vec) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + unsigned long index; + + index = vq->index; + + /* activate the queue */ + vp_modern_set_queue_size(mdev, index, virtqueue_get_vring_size(vq)); + vp_modern_queue_address(mdev, index, virtqueue_get_desc_addr(vq), + virtqueue_get_avail_addr(vq), + virtqueue_get_used_addr(vq)); + + if (msix_vec != VIRTIO_MSI_NO_VECTOR) { + msix_vec = vp_modern_queue_vector(mdev, index, msix_vec); + if (msix_vec == VIRTIO_MSI_NO_VECTOR) + return -EBUSY; + } + + return 0; +} + +static int vp_modern_disable_vq_and_reset(struct virtqueue *vq) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); + struct 
virtio_pci_modern_device *mdev = &vp_dev->mdev; + struct virtio_pci_vq_info *info; + unsigned long flags; + + if (!virtio_has_feature(vq->vdev, VIRTIO_F_RING_RESET)) + return -ENOENT; + + vp_modern_set_queue_reset(mdev, vq->index); + + info = vp_dev->vqs[vq->index]; + + /* delete vq from irq handler */ + spin_lock_irqsave(&vp_dev->lock, flags); + list_del(&info->node); + spin_unlock_irqrestore(&vp_dev->lock, flags); + + INIT_LIST_HEAD(&info->node); + +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION + __virtqueue_break(vq); +#endif + + /* For the case where vq has an exclusive irq, call synchronize_irq() to + * wait for completion. + * + * note: We can't use disable_irq() since it conflicts with the affinity + * managed IRQ that is used by some drivers. + */ + if (vp_dev->per_vq_vectors && info->msix_vector != VIRTIO_MSI_NO_VECTOR) + synchronize_irq(pci_irq_vector(vp_dev->pci_dev, info->msix_vector)); + + vq->reset = true; + + return 0; +} + +static int vp_modern_enable_vq_after_reset(struct virtqueue *vq) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); + struct virtio_pci_modern_device *mdev = &vp_dev->mdev; + struct virtio_pci_vq_info *info; + unsigned long flags, index; + int err; + + if (!vq->reset) + return -EBUSY; + + index = vq->index; + info = vp_dev->vqs[index]; + + if (vp_modern_get_queue_reset(mdev, index)) + return -EBUSY; + + if (vp_modern_get_queue_enable(mdev, index)) + return -EBUSY; + + err = vp_active_vq(vq, info->msix_vector); + if (err) + return err; + + if (vq->callback) { + spin_lock_irqsave(&vp_dev->lock, flags); + list_add(&info->node, &vp_dev->virtqueues); + spin_unlock_irqrestore(&vp_dev->lock, flags); + } else { + INIT_LIST_HEAD(&info->node); + } + +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION + __virtqueue_unbreak(vq); +#endif + + vp_modern_set_queue_enable(&vp_dev->mdev, index, true); + vq->reset = false; + + return 0; +} + static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector) { return vp_modern_config_vector(&vp_dev->mdev, vector); @@ -186,6 +293,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, unsigned int index, void (*callback)(struct virtqueue *vq), const char *name, + u32 size, bool ctx, u16 msix_vec) { @@ -203,47 +311,39 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, if (!num || vp_modern_get_queue_enable(mdev, index)) return ERR_PTR(-ENOENT); - if (num & (num - 1)) { - dev_warn(&vp_dev->pci_dev->dev, "bad queue size %u", num); + if (!size || size > num) + size = num; + + if (size & (size - 1)) { + dev_warn(&vp_dev->pci_dev->dev, "bad queue size %u", size); return ERR_PTR(-EINVAL); } info->msix_vector = msix_vec; /* create the vring */ - vq = vring_create_virtqueue(index, num, + vq = vring_create_virtqueue(index, size, SMP_CACHE_BYTES, &vp_dev->vdev, true, true, ctx, vp_notify, callback, name); if (!vq) return ERR_PTR(-ENOMEM); - /* activate the queue */ - vp_modern_set_queue_size(mdev, index, virtqueue_get_vring_size(vq)); - vp_modern_queue_address(mdev, index, virtqueue_get_desc_addr(vq), - virtqueue_get_avail_addr(vq), - virtqueue_get_used_addr(vq)); + vq->num_max = num; + + err = vp_active_vq(vq, msix_vec); + if (err) + goto err; vq->priv = (void __force *)vp_modern_map_vq_notify(mdev, index, NULL); if (!vq->priv) { err = -ENOMEM; - goto err_map_notify; - } - - if (msix_vec != VIRTIO_MSI_NO_VECTOR) { - msix_vec = vp_modern_queue_vector(mdev, index, msix_vec); - if (msix_vec == VIRTIO_MSI_NO_VECTOR) { - err = -EBUSY; - goto err_assign_vector; - } + goto err; } return vq; 
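Drivers do not call vp_modern_disable_vq_and_reset() and vp_modern_enable_vq_after_reset() directly; virtqueue_resize() in virtio_ring.c, later in this diff, drives them through the config ops. Condensed from that function (virtqueue_resize_packed_or_split() here is shorthand for the two per-layout helpers, not a real symbol), the order of operations is roughly:

	err = vdev->config->disable_vq_and_reset(vq);	/* quiesce and reset */
	while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
		recycle(vq, buf);			/* reclaim posted buffers */
	err = virtqueue_resize_packed_or_split(vq, num);/* swap in the new vring */
	err = vdev->config->enable_vq_after_reset(vq);	/* reactivate the queue */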
-err_assign_vector: - if (!mdev->notify_base) - pci_iounmap(mdev->pci_dev, (void __iomem __force *)vq->priv); -err_map_notify: +err: vring_del_virtqueue(vq); return ERR_PTR(err); } @@ -251,12 +351,15 @@ err_map_notify: static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], const bool *ctx, + const char * const names[], + u32 sizes[], + const bool *ctx, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue *vq; - int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, desc); + int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, sizes, ctx, + desc); if (rc) return rc; @@ -401,6 +504,8 @@ static const struct virtio_config_ops virtio_pci_config_nodev_ops = { .set_vq_affinity = vp_set_vq_affinity, .get_vq_affinity = vp_get_vq_affinity, .get_shm_region = vp_get_shm_region, + .disable_vq_and_reset = vp_modern_disable_vq_and_reset, + .enable_vq_after_reset = vp_modern_enable_vq_after_reset, }; static const struct virtio_config_ops virtio_pci_config_ops = { @@ -419,6 +524,8 @@ static const struct virtio_config_ops virtio_pci_config_ops = { .set_vq_affinity = vp_set_vq_affinity, .get_vq_affinity = vp_get_vq_affinity, .get_shm_region = vp_get_shm_region, + .disable_vq_and_reset = vp_modern_disable_vq_and_reset, + .enable_vq_after_reset = vp_modern_enable_vq_after_reset, }; /* the PCI probing function */ diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c index fa2a9445bb18..869cb46bef96 100644 --- a/drivers/virtio/virtio_pci_modern_dev.c +++ b/drivers/virtio/virtio_pci_modern_dev.c @@ -3,6 +3,7 @@ #include <linux/virtio_pci_modern.h> #include <linux/module.h> #include <linux/pci.h> +#include <linux/delay.h> /* * vp_modern_map_capability - map a part of virtio pci capability @@ -475,6 +476,44 @@ void vp_modern_set_status(struct virtio_pci_modern_device *mdev, EXPORT_SYMBOL_GPL(vp_modern_set_status); /* + * vp_modern_get_queue_reset - get the queue reset status + * @mdev: the modern virtio-pci device + * @index: queue index + */ +int vp_modern_get_queue_reset(struct virtio_pci_modern_device *mdev, u16 index) +{ + struct virtio_pci_modern_common_cfg __iomem *cfg; + + cfg = (struct virtio_pci_modern_common_cfg __iomem *)mdev->common; + + vp_iowrite16(index, &cfg->cfg.queue_select); + return vp_ioread16(&cfg->queue_reset); +} +EXPORT_SYMBOL_GPL(vp_modern_get_queue_reset); + +/* + * vp_modern_set_queue_reset - reset the queue + * @mdev: the modern virtio-pci device + * @index: queue index + */ +void vp_modern_set_queue_reset(struct virtio_pci_modern_device *mdev, u16 index) +{ + struct virtio_pci_modern_common_cfg __iomem *cfg; + + cfg = (struct virtio_pci_modern_common_cfg __iomem *)mdev->common; + + vp_iowrite16(index, &cfg->cfg.queue_select); + vp_iowrite16(1, &cfg->queue_reset); + + while (vp_ioread16(&cfg->queue_reset)) + msleep(1); + + while (vp_ioread16(&cfg->cfg.queue_enable)) + msleep(1); +} +EXPORT_SYMBOL_GPL(vp_modern_set_queue_reset); + +/* * vp_modern_queue_vector - set the MSIX vector for a specific virtqueue * @mdev: the modern virtio-pci device * @index: queue index diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 643ca779fcc6..d66c8e6d0ef3 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -85,6 +85,71 @@ struct vring_desc_extra { u16 next; /* The next desc state in a list. 
*/ }; +struct vring_virtqueue_split { + /* Actual memory layout for this queue. */ + struct vring vring; + + /* Last written value to avail->flags */ + u16 avail_flags_shadow; + + /* + * Last written value to avail->idx in + * guest byte order. + */ + u16 avail_idx_shadow; + + /* Per-descriptor state. */ + struct vring_desc_state_split *desc_state; + struct vring_desc_extra *desc_extra; + + /* DMA address and size information */ + dma_addr_t queue_dma_addr; + size_t queue_size_in_bytes; + + /* + * The parameters for creating vrings are reserved for creating new + * vring. + */ + u32 vring_align; + bool may_reduce_num; +}; + +struct vring_virtqueue_packed { + /* Actual memory layout for this queue. */ + struct { + unsigned int num; + struct vring_packed_desc *desc; + struct vring_packed_desc_event *driver; + struct vring_packed_desc_event *device; + } vring; + + /* Driver ring wrap counter. */ + bool avail_wrap_counter; + + /* Avail used flags. */ + u16 avail_used_flags; + + /* Index of the next avail descriptor. */ + u16 next_avail_idx; + + /* + * Last written value to driver->flags in + * guest byte order. + */ + u16 event_flags_shadow; + + /* Per-descriptor state. */ + struct vring_desc_state_packed *desc_state; + struct vring_desc_extra *desc_extra; + + /* DMA address and size information */ + dma_addr_t ring_dma_addr; + dma_addr_t driver_event_dma_addr; + dma_addr_t device_event_dma_addr; + size_t ring_size_in_bytes; + size_t event_size_in_bytes; +}; + struct vring_virtqueue { struct virtqueue vq; @@ -124,64 +189,10 @@ struct vring_virtqueue { union { /* Available for split ring */ - struct { - /* Actual memory layout for this queue. */ - struct vring vring; - - /* Last written value to avail->flags */ - u16 avail_flags_shadow; - - /* - * Last written value to avail->idx in - * guest byte order. - */ - u16 avail_idx_shadow; - - /* Per-descriptor state. */ - struct vring_desc_state_split *desc_state; - struct vring_desc_extra *desc_extra; - - /* DMA address and size information */ - dma_addr_t queue_dma_addr; - size_t queue_size_in_bytes; - } split; + struct vring_virtqueue_split split; /* Available for packed ring */ - struct { - /* Actual memory layout for this queue. */ - struct { - unsigned int num; - struct vring_packed_desc *desc; - struct vring_packed_desc_event *driver; - struct vring_packed_desc_event *device; - } vring; - - /* Driver ring wrap counter. */ - bool avail_wrap_counter; - - /* Avail used flags. */ - u16 avail_used_flags; - - /* Index of the next avail descriptor. */ - u16 next_avail_idx; - - /* - * Last written value to driver->flags in - * guest byte order. - */ - u16 event_flags_shadow; - - /* Per-descriptor state. */ - struct vring_desc_state_packed *desc_state; - struct vring_desc_extra *desc_extra; - - /* DMA address and size information */ - dma_addr_t ring_dma_addr; - dma_addr_t driver_event_dma_addr; - dma_addr_t device_event_dma_addr; - size_t ring_size_in_bytes; - size_t event_size_in_bytes; - } packed; + struct vring_virtqueue_packed packed; }; /* How to notify other side. FIXME: commonalize hcalls! 
*/ @@ -200,6 +211,16 @@ struct vring_virtqueue { #endif }; +static struct virtqueue *__vring_new_virtqueue(unsigned int index, + struct vring_virtqueue_split *vring_split, + struct virtio_device *vdev, + bool weak_barriers, + bool context, + bool (*notify)(struct virtqueue *), + void (*callback)(struct virtqueue *), + const char *name); +static struct vring_desc_extra *vring_alloc_desc_extra(unsigned int num); +static void vring_free(struct virtqueue *_vq); /* * Helpers. @@ -364,6 +385,24 @@ static int vring_mapping_error(const struct vring_virtqueue *vq, return dma_mapping_error(vring_dma_dev(vq), addr); } +static void virtqueue_init(struct vring_virtqueue *vq, u32 num) +{ + vq->vq.num_free = num; + + if (vq->packed_ring) + vq->last_used_idx = 0 | (1 << VRING_PACKED_EVENT_F_WRAP_CTR); + else + vq->last_used_idx = 0; + + vq->event_triggered = false; + vq->num_added = 0; + +#ifdef DEBUG + vq->in_use = false; + vq->last_add_time_valid = false; +#endif +} + /* * Split ring specific functions - *_split(). @@ -907,28 +946,107 @@ static void *virtqueue_detach_unused_buf_split(struct virtqueue *_vq) return NULL; } -static struct virtqueue *vring_create_virtqueue_split( - unsigned int index, - unsigned int num, - unsigned int vring_align, - struct virtio_device *vdev, - bool weak_barriers, - bool may_reduce_num, - bool context, - bool (*notify)(struct virtqueue *), - void (*callback)(struct virtqueue *), - const char *name) +static void virtqueue_vring_init_split(struct vring_virtqueue_split *vring_split, + struct vring_virtqueue *vq) +{ + struct virtio_device *vdev; + + vdev = vq->vq.vdev; + + vring_split->avail_flags_shadow = 0; + vring_split->avail_idx_shadow = 0; + + /* No callback? Tell other side not to bother us. */ + if (!vq->vq.callback) { + vring_split->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT; + if (!vq->event) + vring_split->vring.avail->flags = cpu_to_virtio16(vdev, + vring_split->avail_flags_shadow); + } +} + +static void virtqueue_reinit_split(struct vring_virtqueue *vq) +{ + int num; + + num = vq->split.vring.num; + + vq->split.vring.avail->flags = 0; + vq->split.vring.avail->idx = 0; + + /* reset avail event */ + vq->split.vring.avail->ring[num] = 0; + + vq->split.vring.used->flags = 0; + vq->split.vring.used->idx = 0; + + /* reset used event */ + *(__virtio16 *)&(vq->split.vring.used->ring[num]) = 0; + + virtqueue_init(vq, num); + + virtqueue_vring_init_split(&vq->split, vq); +} + +static void virtqueue_vring_attach_split(struct vring_virtqueue *vq, + struct vring_virtqueue_split *vring_split) +{ + vq->split = *vring_split; + + /* Put everything in free lists. 
*/ + vq->free_head = 0; +} + +static int vring_alloc_state_extra_split(struct vring_virtqueue_split *vring_split) +{ + struct vring_desc_state_split *state; + struct vring_desc_extra *extra; + u32 num = vring_split->vring.num; + + state = kmalloc_array(num, sizeof(struct vring_desc_state_split), GFP_KERNEL); + if (!state) + goto err_state; + + extra = vring_alloc_desc_extra(num); + if (!extra) + goto err_extra; + + memset(state, 0, num * sizeof(struct vring_desc_state_split)); + + vring_split->desc_state = state; + vring_split->desc_extra = extra; + return 0; + +err_extra: + kfree(state); +err_state: + return -ENOMEM; +} + +static void vring_free_split(struct vring_virtqueue_split *vring_split, + struct virtio_device *vdev) +{ + vring_free_queue(vdev, vring_split->queue_size_in_bytes, + vring_split->vring.desc, + vring_split->queue_dma_addr); + + kfree(vring_split->desc_state); + kfree(vring_split->desc_extra); +} + +static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split, + struct virtio_device *vdev, + u32 num, + unsigned int vring_align, + bool may_reduce_num) { - struct virtqueue *vq; void *queue = NULL; dma_addr_t dma_addr; - size_t queue_size_in_bytes; - struct vring vring; /* We assume num is a power of 2. */ if (num & (num - 1)) { dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num); - return NULL; + return -EINVAL; } /* TODO: allocate each queue chunk individually */ @@ -939,11 +1057,11 @@ static struct virtqueue *vring_create_virtqueue_split( if (queue) break; if (!may_reduce_num) - return NULL; + return -ENOMEM; } if (!num) - return NULL; + return -ENOMEM; if (!queue) { /* Try to get a single page. You are my only hope! */ @@ -951,26 +1069,85 @@ static struct virtqueue *vring_create_virtqueue_split( &dma_addr, GFP_KERNEL|__GFP_ZERO); } if (!queue) - return NULL; + return -ENOMEM; + + vring_init(&vring_split->vring, num, queue, vring_align); + + vring_split->queue_dma_addr = dma_addr; + vring_split->queue_size_in_bytes = vring_size(num, vring_align); - queue_size_in_bytes = vring_size(num, vring_align); - vring_init(&vring, num, queue, vring_align); + vring_split->vring_align = vring_align; + vring_split->may_reduce_num = may_reduce_num; - vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context, - notify, callback, name); + return 0; +} + +static struct virtqueue *vring_create_virtqueue_split( + unsigned int index, + unsigned int num, + unsigned int vring_align, + struct virtio_device *vdev, + bool weak_barriers, + bool may_reduce_num, + bool context, + bool (*notify)(struct virtqueue *), + void (*callback)(struct virtqueue *), + const char *name) +{ + struct vring_virtqueue_split vring_split = {}; + struct virtqueue *vq; + int err; + + err = vring_alloc_queue_split(&vring_split, vdev, num, vring_align, + may_reduce_num); + if (err) + return NULL; + + vq = __vring_new_virtqueue(index, &vring_split, vdev, weak_barriers, + context, notify, callback, name); if (!vq) { - vring_free_queue(vdev, queue_size_in_bytes, queue, - dma_addr); + vring_free_split(&vring_split, vdev); return NULL; } - to_vvq(vq)->split.queue_dma_addr = dma_addr; - to_vvq(vq)->split.queue_size_in_bytes = queue_size_in_bytes; to_vvq(vq)->we_own_ring = true; return vq; } +static int virtqueue_resize_split(struct virtqueue *_vq, u32 num) +{ + struct vring_virtqueue_split vring_split = {}; + struct vring_virtqueue *vq = to_vvq(_vq); + struct virtio_device *vdev = _vq->vdev; + int err; + + err = vring_alloc_queue_split(&vring_split, vdev, num, + vq->split.vring_align, + 
vq->split.may_reduce_num); + if (err) + goto err; + + err = vring_alloc_state_extra_split(&vring_split); + if (err) + goto err_state_extra; + + vring_free(&vq->vq); + + virtqueue_vring_init_split(&vring_split, vq); + + virtqueue_init(vq, vring_split.vring.num); + virtqueue_vring_attach_split(vq, &vring_split); + + return 0; + +err_state_extra: + vring_free_split(&vring_split, vdev); +err: + virtqueue_reinit_split(vq); + return -ENOMEM; +} + /* * Packed ring specific functions - *_packed(). @@ -1637,8 +1814,7 @@ static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq) return NULL; } -static struct vring_desc_extra *vring_alloc_desc_extra(struct vring_virtqueue *vq, - unsigned int num) +static struct vring_desc_extra *vring_alloc_desc_extra(unsigned int num) { struct vring_desc_extra *desc_extra; unsigned int i; @@ -1656,19 +1832,32 @@ static struct vring_desc_extra *vring_alloc_desc_extra(struct vring_virtqueue *v return desc_extra; } -static struct virtqueue *vring_create_virtqueue_packed( - unsigned int index, - unsigned int num, - unsigned int vring_align, - struct virtio_device *vdev, - bool weak_barriers, - bool may_reduce_num, - bool context, - bool (*notify)(struct virtqueue *), - void (*callback)(struct virtqueue *), - const char *name) +static void vring_free_packed(struct vring_virtqueue_packed *vring_packed, + struct virtio_device *vdev) +{ + if (vring_packed->vring.desc) + vring_free_queue(vdev, vring_packed->ring_size_in_bytes, + vring_packed->vring.desc, + vring_packed->ring_dma_addr); + + if (vring_packed->vring.driver) + vring_free_queue(vdev, vring_packed->event_size_in_bytes, + vring_packed->vring.driver, + vring_packed->driver_event_dma_addr); + + if (vring_packed->vring.device) + vring_free_queue(vdev, vring_packed->event_size_in_bytes, + vring_packed->vring.device, + vring_packed->device_event_dma_addr); + + kfree(vring_packed->desc_state); + kfree(vring_packed->desc_extra); +} + +static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed, + struct virtio_device *vdev, + u32 num) { - struct vring_virtqueue *vq; struct vring_packed_desc *ring; struct vring_packed_desc_event *driver, *device; dma_addr_t ring_dma_addr, driver_event_dma_addr, device_event_dma_addr; @@ -1680,7 +1869,11 @@ static struct virtqueue *vring_create_virtqueue_packed( &ring_dma_addr, GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO); if (!ring) - goto err_ring; + goto err; + + vring_packed->vring.desc = ring; + vring_packed->ring_dma_addr = ring_dma_addr; + vring_packed->ring_size_in_bytes = ring_size_in_bytes; event_size_in_bytes = sizeof(struct vring_packed_desc_event); @@ -1688,13 +1881,112 @@ static struct virtqueue *vring_create_virtqueue_packed( &driver_event_dma_addr, GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO); if (!driver) - goto err_driver; + goto err; + + vring_packed->vring.driver = driver; + vring_packed->event_size_in_bytes = event_size_in_bytes; + vring_packed->driver_event_dma_addr = driver_event_dma_addr; device = vring_alloc_queue(vdev, event_size_in_bytes, &device_event_dma_addr, GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO); if (!device) - goto err_device; + goto err; + + vring_packed->vring.device = device; + vring_packed->device_event_dma_addr = device_event_dma_addr; + + vring_packed->vring.num = num; + + return 0; + +err: + vring_free_packed(vring_packed, vdev); + return -ENOMEM; +} + +static int vring_alloc_state_extra_packed(struct vring_virtqueue_packed *vring_packed) +{ + struct vring_desc_state_packed *state; + struct vring_desc_extra *extra; + u32 num = 
vring_packed->vring.num; + + state = kmalloc_array(num, sizeof(struct vring_desc_state_packed), GFP_KERNEL); + if (!state) + goto err_desc_state; + + memset(state, 0, num * sizeof(struct vring_desc_state_packed)); + + extra = vring_alloc_desc_extra(num); + if (!extra) + goto err_desc_extra; + + vring_packed->desc_state = state; + vring_packed->desc_extra = extra; + + return 0; + +err_desc_extra: + kfree(state); +err_desc_state: + return -ENOMEM; +} + +static void virtqueue_vring_init_packed(struct vring_virtqueue_packed *vring_packed, + bool callback) +{ + vring_packed->next_avail_idx = 0; + vring_packed->avail_wrap_counter = 1; + vring_packed->event_flags_shadow = 0; + vring_packed->avail_used_flags = 1 << VRING_PACKED_DESC_F_AVAIL; + + /* No callback? Tell other side not to bother us. */ + if (!callback) { + vring_packed->event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE; + vring_packed->vring.driver->flags = + cpu_to_le16(vring_packed->event_flags_shadow); + } +} + +static void virtqueue_vring_attach_packed(struct vring_virtqueue *vq, + struct vring_virtqueue_packed *vring_packed) +{ + vq->packed = *vring_packed; + + /* Put everything in free lists. */ + vq->free_head = 0; +} + +static void virtqueue_reinit_packed(struct vring_virtqueue *vq) +{ + memset(vq->packed.vring.device, 0, vq->packed.event_size_in_bytes); + memset(vq->packed.vring.driver, 0, vq->packed.event_size_in_bytes); + + /* we need to reset the desc.flags. For more, see is_used_desc_packed() */ + memset(vq->packed.vring.desc, 0, vq->packed.ring_size_in_bytes); + + virtqueue_init(vq, vq->packed.vring.num); + virtqueue_vring_init_packed(&vq->packed, !!vq->vq.callback); +} + +static struct virtqueue *vring_create_virtqueue_packed( + unsigned int index, + unsigned int num, + unsigned int vring_align, + struct virtio_device *vdev, + bool weak_barriers, + bool may_reduce_num, + bool context, + bool (*notify)(struct virtqueue *), + void (*callback)(struct virtqueue *), + const char *name) +{ + struct vring_virtqueue_packed vring_packed = {}; + struct vring_virtqueue *vq; + int err; + + if (vring_alloc_queue_packed(&vring_packed, vdev, num)) + goto err_ring; vq = kmalloc(sizeof(*vq), GFP_KERNEL); if (!vq) @@ -1703,8 +1995,8 @@ static struct virtqueue *vring_create_virtqueue_packed( vq->vq.callback = callback; vq->vq.vdev = vdev; vq->vq.name = name; - vq->vq.num_free = num; vq->vq.index = index; + vq->vq.reset = false; vq->we_own_ring = true; vq->notify = notify; vq->weak_barriers = weak_barriers; @@ -1713,15 +2005,8 @@ static struct virtqueue *vring_create_virtqueue_packed( #else vq->broken = false; #endif - vq->last_used_idx = 0 | (1 << VRING_PACKED_EVENT_F_WRAP_CTR); - vq->event_triggered = false; - vq->num_added = 0; vq->packed_ring = true; vq->use_dma_api = vring_use_dma_api(vdev); -#ifdef DEBUG - vq->in_use = false; - vq->last_add_time_valid = false; -#endif vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && !context; @@ -1730,65 +2015,58 @@ static struct virtqueue *vring_create_virtqueue_packed( if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) vq->weak_barriers = false; - vq->packed.ring_dma_addr = ring_dma_addr; - vq->packed.driver_event_dma_addr = driver_event_dma_addr; - vq->packed.device_event_dma_addr = device_event_dma_addr; - - vq->packed.ring_size_in_bytes = ring_size_in_bytes; - vq->packed.event_size_in_bytes = event_size_in_bytes; - - vq->packed.vring.num = num; - vq->packed.vring.desc = ring; - vq->packed.vring.driver = driver; - vq->packed.vring.device = device; - - 
vq->packed.next_avail_idx = 0; - vq->packed.avail_wrap_counter = 1; - vq->packed.event_flags_shadow = 0; - vq->packed.avail_used_flags = 1 << VRING_PACKED_DESC_F_AVAIL; - - vq->packed.desc_state = kmalloc_array(num, - sizeof(struct vring_desc_state_packed), - GFP_KERNEL); - if (!vq->packed.desc_state) - goto err_desc_state; - - memset(vq->packed.desc_state, 0, - num * sizeof(struct vring_desc_state_packed)); - - /* Put everything in free lists. */ - vq->free_head = 0; + err = vring_alloc_state_extra_packed(&vring_packed); + if (err) + goto err_state_extra; - vq->packed.desc_extra = vring_alloc_desc_extra(vq, num); - if (!vq->packed.desc_extra) - goto err_desc_extra; + virtqueue_vring_init_packed(&vring_packed, !!callback); - /* No callback? Tell other side not to bother us. */ - if (!callback) { - vq->packed.event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE; - vq->packed.vring.driver->flags = - cpu_to_le16(vq->packed.event_flags_shadow); - } + virtqueue_init(vq, num); + virtqueue_vring_attach_packed(vq, &vring_packed); spin_lock(&vdev->vqs_list_lock); list_add_tail(&vq->vq.list, &vdev->vqs); spin_unlock(&vdev->vqs_list_lock); return &vq->vq; -err_desc_extra: - kfree(vq->packed.desc_state); -err_desc_state: +err_state_extra: kfree(vq); err_vq: - vring_free_queue(vdev, event_size_in_bytes, device, device_event_dma_addr); -err_device: - vring_free_queue(vdev, event_size_in_bytes, driver, driver_event_dma_addr); -err_driver: - vring_free_queue(vdev, ring_size_in_bytes, ring, ring_dma_addr); + vring_free_packed(&vring_packed, vdev); err_ring: return NULL; } +static int virtqueue_resize_packed(struct virtqueue *_vq, u32 num) +{ + struct vring_virtqueue_packed vring_packed = {}; + struct vring_virtqueue *vq = to_vvq(_vq); + struct virtio_device *vdev = _vq->vdev; + int err; + + if (vring_alloc_queue_packed(&vring_packed, vdev, num)) + goto err_ring; + + err = vring_alloc_state_extra_packed(&vring_packed); + if (err) + goto err_state_extra; + + vring_free(&vq->vq); + + virtqueue_vring_init_packed(&vring_packed, !!vq->vq.callback); + + virtqueue_init(vq, vring_packed.vring.num); + virtqueue_vring_attach_packed(vq, &vring_packed); + + return 0; + +err_state_extra: + vring_free_packed(&vring_packed, vdev); +err_ring: + virtqueue_reinit_packed(vq); + return -ENOMEM; +} + /* * Generic functions and exported symbols. @@ -2131,8 +2409,8 @@ EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed); * @_vq: the struct virtqueue we're talking about. * * Returns NULL or the "data" token handed to virtqueue_add_*(). - * This is not valid on an active queue; it is useful only for device - * shutdown. + * This is not valid on an active queue; it is useful for device + * shutdown or the reset queue. 
 */
void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
{
@@ -2180,16 +2458,17 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
EXPORT_SYMBOL_GPL(vring_interrupt);

/* Only available for split ring */
-struct virtqueue *__vring_new_virtqueue(unsigned int index,
-                                        struct vring vring,
-                                        struct virtio_device *vdev,
-                                        bool weak_barriers,
-                                        bool context,
-                                        bool (*notify)(struct virtqueue *),
-                                        void (*callback)(struct virtqueue *),
-                                        const char *name)
+static struct virtqueue *__vring_new_virtqueue(unsigned int index,
+                                               struct vring_virtqueue_split *vring_split,
+                                               struct virtio_device *vdev,
+                                               bool weak_barriers,
+                                               bool context,
+                                               bool (*notify)(struct virtqueue *),
+                                               void (*callback)(struct virtqueue *),
+                                               const char *name)
{
        struct vring_virtqueue *vq;
+        int err;

        if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED))
                return NULL;
@@ -2202,8 +2481,8 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
        vq->vq.callback = callback;
        vq->vq.vdev = vdev;
        vq->vq.name = name;
-        vq->vq.num_free = vring.num;
        vq->vq.index = index;
+        vq->vq.reset = false;
        vq->we_own_ring = false;
        vq->notify = notify;
        vq->weak_barriers = weak_barriers;
@@ -2212,14 +2491,7 @@
#else
        vq->broken = false;
#endif
-        vq->last_used_idx = 0;
-        vq->event_triggered = false;
-        vq->num_added = 0;
        vq->use_dma_api = vring_use_dma_api(vdev);
-#ifdef DEBUG
-        vq->in_use = false;
-        vq->last_add_time_valid = false;
-#endif

        vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
                !context;
@@ -2228,47 +2500,22 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
        if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM))
                vq->weak_barriers = false;

-        vq->split.queue_dma_addr = 0;
-        vq->split.queue_size_in_bytes = 0;
-
-        vq->split.vring = vring;
-        vq->split.avail_flags_shadow = 0;
-        vq->split.avail_idx_shadow = 0;
-
-        /* No callback? Tell other side not to bother us. */
-        if (!callback) {
-                vq->split.avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
-                if (!vq->event)
-                        vq->split.vring.avail->flags = cpu_to_virtio16(vdev,
-                                        vq->split.avail_flags_shadow);
+        err = vring_alloc_state_extra_split(vring_split);
+        if (err) {
+                kfree(vq);
+                return NULL;
        }

-        vq->split.desc_state = kmalloc_array(vring.num,
-                        sizeof(struct vring_desc_state_split), GFP_KERNEL);
-        if (!vq->split.desc_state)
-                goto err_state;
-
-        vq->split.desc_extra = vring_alloc_desc_extra(vq, vring.num);
-        if (!vq->split.desc_extra)
-                goto err_extra;
+        virtqueue_vring_init_split(vring_split, vq);

-        /* Put everything in free lists. */
-        vq->free_head = 0;
-        memset(vq->split.desc_state, 0, vring.num *
-                        sizeof(struct vring_desc_state_split));
+        virtqueue_init(vq, vring_split->vring.num);
+        virtqueue_vring_attach_split(vq, vring_split);

        spin_lock(&vdev->vqs_list_lock);
        list_add_tail(&vq->vq.list, &vdev->vqs);
        spin_unlock(&vdev->vqs_list_lock);
        return &vq->vq;
-
-err_extra:
-        kfree(vq->split.desc_state);
-err_state:
-        kfree(vq);
-        return NULL;
}
-EXPORT_SYMBOL_GPL(__vring_new_virtqueue);

struct virtqueue *vring_create_virtqueue(
        unsigned int index,
@@ -2294,6 +2541,75 @@ struct virtqueue *vring_create_virtqueue(
}
EXPORT_SYMBOL_GPL(vring_create_virtqueue);

+/**
+ * virtqueue_resize - resize the vring of vq
+ * @_vq: the struct virtqueue we're talking about.
+ * @num: new ring size
+ * @recycle: callback to recycle buffers that are no longer used
+ *
+ * If a new vring really needs to be created, the current vq is put into
+ * the reset state and the passed callback is invoked to recycle each
+ * buffer that is no longer used. The old vring is released only after
+ * the new vring has been successfully created.
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error.
+ * 0: success.
+ * -ENOMEM: Failed to allocate a new ring; the vq falls back to the
+ *          original ring size and can still work normally.
+ * -EBUSY: Failed to sync with the device; the vq may not work properly.
+ * -ENOENT: Transport or device does not support this operation.
+ * -E2BIG/-EINVAL: @num is invalid.
+ * -EPERM: Operation not permitted.
+ *
+ */
+int virtqueue_resize(struct virtqueue *_vq, u32 num,
+                     void (*recycle)(struct virtqueue *vq, void *buf))
+{
+        struct vring_virtqueue *vq = to_vvq(_vq);
+        struct virtio_device *vdev = vq->vq.vdev;
+        void *buf;
+        int err;
+
+        if (!vq->we_own_ring)
+                return -EPERM;
+
+        if (num > vq->vq.num_max)
+                return -E2BIG;
+
+        if (!num)
+                return -EINVAL;
+
+        if ((vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num) == num)
+                return 0;
+
+        if (!vdev->config->disable_vq_and_reset)
+                return -ENOENT;
+
+        if (!vdev->config->enable_vq_after_reset)
+                return -ENOENT;
+
+        err = vdev->config->disable_vq_and_reset(_vq);
+        if (err)
+                return err;
+
+        while ((buf = virtqueue_detach_unused_buf(_vq)) != NULL)
+                recycle(_vq, buf);
+
+        if (vq->packed_ring)
+                err = virtqueue_resize_packed(_vq, num);
+        else
+                err = virtqueue_resize_split(_vq, num);
+
+        if (vdev->config->enable_vq_after_reset(_vq))
+                return -EBUSY;
+
+        return err;
+}
+EXPORT_SYMBOL_GPL(virtqueue_resize);
+
/* Only available for split ring */
struct virtqueue *vring_new_virtqueue(unsigned int index,
                                      unsigned int num,
@@ -2306,25 +2622,21 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
                                      void (*callback)(struct virtqueue *vq),
                                      const char *name)
{
-        struct vring vring;
+        struct vring_virtqueue_split vring_split = {};

        if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED))
                return NULL;

-        vring_init(&vring, num, pages, vring_align);
-        return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
-                                     notify, callback, name);
+        vring_init(&vring_split.vring, num, pages, vring_align);
+        return __vring_new_virtqueue(index, &vring_split, vdev, weak_barriers,
+                                     context, notify, callback, name);
}
EXPORT_SYMBOL_GPL(vring_new_virtqueue);

-void vring_del_virtqueue(struct virtqueue *_vq)
+static void vring_free(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

-        spin_lock(&vq->vq.vdev->vqs_list_lock);
-        list_del(&_vq->list);
-        spin_unlock(&vq->vq.vdev->vqs_list_lock);
-
        if (vq->we_own_ring) {
                if (vq->packed_ring) {
                        vring_free_queue(vq->vq.vdev,
@@ -2355,6 +2667,18 @@ void vring_del_virtqueue(struct virtqueue *_vq)
                        kfree(vq->split.desc_state);
                        kfree(vq->split.desc_extra);
        }
+}
+
+void vring_del_virtqueue(struct virtqueue *_vq)
+{
+        struct vring_virtqueue *vq = to_vvq(_vq);
+
+        spin_lock(&vq->vq.vdev->vqs_list_lock);
+        list_del(&_vq->list);
+        spin_unlock(&vq->vq.vdev->vqs_list_lock);
+
+        vring_free(_vq);
+        kfree(vq);
}
EXPORT_SYMBOL_GPL(vring_del_virtqueue);

@@ -2402,6 +2726,30 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
}
EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);

+/*
+ * This function should only be called by the core, not directly by the driver.
+ */
+void __virtqueue_break(struct virtqueue *_vq)
+{
+        struct vring_virtqueue *vq = to_vvq(_vq);
+
+        /* Pairs with READ_ONCE() in virtqueue_is_broken(). */
+        WRITE_ONCE(vq->broken, true);
+}
+EXPORT_SYMBOL_GPL(__virtqueue_break);
+
+/*
+ * This function should only be called by the core, not directly by the driver.
+ */
+void __virtqueue_unbreak(struct virtqueue *_vq)
+{
+        struct vring_virtqueue *vq = to_vvq(_vq);
+
+        /* Pairs with READ_ONCE() in virtqueue_is_broken(). */
+        WRITE_ONCE(vq->broken, false);
+}
+EXPORT_SYMBOL_GPL(__virtqueue_unbreak);
+
bool virtqueue_is_broken(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);
diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c
index c40f7deb6b5a..9bc4d110b800 100644
--- a/drivers/virtio/virtio_vdpa.c
+++ b/drivers/virtio/virtio_vdpa.c
@@ -131,7 +131,7 @@ static irqreturn_t virtio_vdpa_virtqueue_cb(void *private)
static struct virtqueue *
virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index,
                     void (*callback)(struct virtqueue *vq),
-                     const char *name, bool ctx)
+                     const char *name, u32 size, bool ctx)
{
        struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev);
        struct vdpa_device *vdpa = vd_get_vdpa(vdev);
@@ -168,14 +168,17 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index,
                goto error_new_virtqueue;
        }

+        if (!size || size > max_num)
+                size = max_num;
+
        if (ops->get_vq_num_min)
                min_num = ops->get_vq_num_min(vdpa);

-        may_reduce_num = (max_num == min_num) ? false : true;
+        may_reduce_num = (size == min_num) ? false : true;

        /* Create the vring */
        align = ops->get_vq_align(vdpa);
-        vq = vring_create_virtqueue(index, max_num, align, vdev,
+        vq = vring_create_virtqueue(index, size, align, vdev,
                                    true, may_reduce_num, ctx,
                                    virtio_vdpa_notify, callback, name);
        if (!vq) {
@@ -183,6 +186,8 @@
                goto error_new_virtqueue;
        }

+        vq->num_max = max_num;
+
        /* Setup virtqueue callback */
        cb.callback = callback ? virtio_vdpa_virtqueue_cb : NULL;
        cb.private = info;
@@ -267,6 +272,7 @@ static int virtio_vdpa_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
                               struct virtqueue *vqs[],
                               vq_callback_t *callbacks[],
                               const char * const names[],
+                               u32 sizes[],
                               const bool *ctx,
                               struct irq_affinity *desc)
{
@@ -282,9 +288,9 @@
                        continue;
                }

-                vqs[i] = virtio_vdpa_setup_vq(vdev, queue_idx++,
-                                              callbacks[i], names[i], ctx ?
-                                              ctx[i] : false);
+                vqs[i] = virtio_vdpa_setup_vq(vdev, queue_idx++, callbacks[i],
+                                              names[i], sizes ? sizes[i] : 0,
+                                              ctx ? ctx[i] : false);
                if (IS_ERR(vqs[i])) {
                        err = PTR_ERR(vqs[i]);
                        goto err_setup_vq;
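The packed-ring hunks above split virtqueue creation into allocation helpers (vring_alloc_queue_packed(), vring_alloc_state_extra_packed()) and commit helpers (virtqueue_vring_init_packed(), virtqueue_vring_attach_packed()), so virtqueue_resize_packed() can build the new vring completely before the old one is released, and fall back via virtqueue_reinit_packed() on failure. A minimal userspace model of that commit-or-rollback shape follows; all names here are illustrative, not kernel API:

#include <stdlib.h>

/* Illustrative stand-in for a vring plus its driver-side state. */
struct toy_ring {
        unsigned int num;
        void **state;
};

static int toy_ring_alloc(struct toy_ring *r, unsigned int num)
{
        r->state = calloc(num, sizeof(*r->state));
        if (!r->state)
                return -1;
        r->num = num;
        return 0;
}

static void toy_ring_free(struct toy_ring *r)
{
        free(r->state);
        r->state = NULL;
        r->num = 0;
}

/*
 * Commit-or-rollback resize: fully build the new ring before dropping
 * the old one. On failure the old ring is kept; in the real code it is
 * additionally re-initialized to an empty state (virtqueue_reinit_packed()),
 * because the caller has already detached all unused buffers.
 */
static int toy_resize(struct toy_ring *cur, unsigned int num)
{
        struct toy_ring next = { 0 };

        if (toy_ring_alloc(&next, num))
                return -1;      /* rollback: *cur untouched, still usable */

        toy_ring_free(cur);     /* commit: release old only after success */
        *cur = next;
        return 0;
}

The same allocate-then-commit ordering is why -ENOMEM from virtqueue_resize() leaves the vq working at its original size.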
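Given the virtqueue_resize() contract documented above, a driver-side resize path might look roughly like the sketch below. Only virtqueue_resize() and virtqueue_get_vring_size() come from this patch series; my_dev, my_free_unused_buf(), and the surrounding structure are hypothetical:

/* Hypothetical driver: change an RX ring's size at runtime. */
struct my_dev {
        struct virtio_device *vdev;
        struct virtqueue *rx_vq;
};

/* Invoked by virtqueue_resize() once per buffer still sitting in the
 * old vring; ownership returns to the driver, which must free it. */
static void my_free_unused_buf(struct virtqueue *vq, void *buf)
{
        kfree(buf);
}

static int my_set_rx_ring_size(struct my_dev *dev, u32 num)
{
        int err;

        /* The core resets the vq, drains unused buffers through the
         * callback, swaps the vring, and re-enables the vq. */
        err = virtqueue_resize(dev->rx_vq, num, my_free_unused_buf);
        if (err == -ENOMEM)     /* old ring kept; vq still works */
                dev_warn(&dev->vdev->dev, "rx ring kept at %u entries\n",
                         virtqueue_get_vring_size(dev->rx_vq));
        return err;
}

After a successful resize the new ring is empty, so a real driver would also refill its receive buffers before resuming traffic.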
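The WRITE_ONCE()/READ_ONCE() pairing that the __virtqueue_break()/__virtqueue_unbreak() comments refer to amounts to the read side sketched below. This is the expected shape of virtqueue_is_broken(); its actual body lies outside this hunk:

bool virtqueue_is_broken(struct virtqueue *_vq)
{
        struct vring_virtqueue *vq = to_vvq(_vq);

        /* READ_ONCE() pairs with the WRITE_ONCE() in __virtqueue_break()
         * and __virtqueue_unbreak(): the core may flip the flag while a
         * driver polls it, so both sides need single, untorn accesses. */
        return READ_ONCE(vq->broken);
}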
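Finally, the virtio_vdpa hunks let find_vqs() pass a per-queue size hint, clamped in virtio_vdpa_setup_vq(); vq->num_max still records the device limit, so a later virtqueue_resize() can grow the ring back up to it. A standalone restatement of the clamp rule (helper name is illustrative):

/* Clamp rule from virtio_vdpa_setup_vq(): a hint of 0, or anything
 * above the device maximum, falls back to the maximum. */
static u32 clamp_vq_size(u32 size, u32 max_num)
{
        return (!size || size > max_num) ? max_num : size;
}
/* e.g. max_num = 256: hints 0, 128, 512 -> rings of 256, 128, 256 */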