summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJens Axboe <axboe@kernel.dk>2024-03-05 13:10:04 -0700
committerJens Axboe <axboe@kernel.dk>2024-04-22 11:26:05 -0600
commita05d1f625c7aa681d8816bc0f10089289ad07aad (patch)
treeed2c7807c31573f53f802550e24b3d98d87901ac
parent35c8711c8fc4c16ad2749b8314da5829a493e28e (diff)
io_uring/net: support bundles for send
If IORING_OP_SEND is used with provided buffers, the caller may also set IORING_RECVSEND_BUNDLE to turn it into a multi-buffer send. The idea is that an application can fill outgoing buffers in a provided buffer group, and then arm a single send that will service them all. Once there are no more buffers to send, or if the requested length has been sent, the request posts a single completion for all the buffers. This only enables it for IORING_OP_SEND, IORING_OP_SENDMSG is coming in a separate patch. However, this patch does do a lot of the prep work that makes wiring up the sendmsg variant pretty trivial. They share the prep side. Signed-off-by: Jens Axboe <axboe@kernel.dk>
-rw-r--r--include/uapi/linux/io_uring.h9
-rw-r--r--io_uring/net.c145
2 files changed, 137 insertions, 17 deletions
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index a7f847543a7f..7f583927c908 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -351,11 +351,20 @@ enum io_uring_op {
* 0 is reported if zerocopy was actually possible.
* IORING_NOTIF_USAGE_ZC_COPIED if data was copied
* (at least partially).
+ *
+ * IORING_RECVSEND_BUNDLE Used with IOSQE_BUFFER_SELECT. If set, send will
+ * grab as many buffers from the buffer group ID
+ * given and send them all. The completion result
+ * will be the number of buffers send, with the
+ * starting buffer ID in cqe->flags as per usual
+ * for provided buffer usage. The buffers will be
+ * contigious from the starting buffer ID.
*/
#define IORING_RECVSEND_POLL_FIRST (1U << 0)
#define IORING_RECV_MULTISHOT (1U << 1)
#define IORING_RECVSEND_FIXED_BUF (1U << 2)
#define IORING_SEND_ZC_REPORT_USAGE (1U << 3)
+#define IORING_RECVSEND_BUNDLE (1U << 4)
/*
* cqe.res for IORING_CQE_F_NOTIF if
diff --git a/io_uring/net.c b/io_uring/net.c
index 13685d133582..3e326576254b 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -57,7 +57,7 @@ struct io_sr_msg {
struct user_msghdr __user *umsg;
void __user *buf;
};
- unsigned len;
+ int len;
unsigned done_io;
unsigned msg_flags;
unsigned nr_multishot_loops;
@@ -389,6 +389,8 @@ static int io_sendmsg_prep_setup(struct io_kiocb *req, int is_msg)
return ret;
}
+#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE)
+
int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
@@ -407,11 +409,20 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
sr->flags = READ_ONCE(sqe->ioprio);
- if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
+ if (sr->flags & ~SENDMSG_FLAGS)
return -EINVAL;
sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
if (sr->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
+ if (sr->flags & IORING_RECVSEND_BUNDLE) {
+ if (req->opcode == IORING_OP_SENDMSG)
+ return -EINVAL;
+ if (!(req->flags & REQ_F_BUFFER_SELECT))
+ return -EINVAL;
+ sr->msg_flags |= MSG_WAITALL;
+ sr->buf_group = req->buf_index;
+ req->buf_list = NULL;
+ }
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
@@ -427,6 +438,79 @@ static void io_req_msg_cleanup(struct io_kiocb *req,
io_netmsg_recycle(req, issue_flags);
}
+/*
+ * For bundle completions, we need to figure out how many segments we consumed.
+ * A bundle could be using a single ITER_UBUF if that's all we mapped, or it
+ * could be using an ITER_IOVEC. If the latter, then if we consumed all of
+ * the segments, then it's a trivial questiont o answer. If we have residual
+ * data in the iter, then loop the segments to figure out how much we
+ * transferred.
+ */
+static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
+{
+ struct iovec *iov;
+ int nbufs;
+
+ /* no data is always zero segments, and a ubuf is always 1 segment */
+ if (ret <= 0)
+ return 0;
+ if (iter_is_ubuf(&kmsg->msg.msg_iter))
+ return 1;
+
+ iov = kmsg->free_iov;
+ if (!iov)
+ iov = &kmsg->fast_iov;
+
+ /* if all data was transferred, it's basic pointer math */
+ if (!iov_iter_count(&kmsg->msg.msg_iter))
+ return iter_iov(&kmsg->msg.msg_iter) - iov;
+
+ /* short transfer, count segments */
+ nbufs = 0;
+ do {
+ int this_len = min_t(int, iov[nbufs].iov_len, ret);
+
+ nbufs++;
+ ret -= this_len;
+ } while (ret);
+
+ return nbufs;
+}
+
+static inline bool io_send_finish(struct io_kiocb *req, int *ret,
+ struct io_async_msghdr *kmsg,
+ unsigned issue_flags)
+{
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ bool bundle_finished = *ret <= 0;
+ unsigned int cflags;
+
+ if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
+ cflags = io_put_kbuf(req, issue_flags);
+ goto finish;
+ }
+
+ cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), issue_flags);
+
+ if (bundle_finished || req->flags & REQ_F_BL_EMPTY)
+ goto finish;
+
+ /*
+ * Fill CQE for this receive and see if we should keep trying to
+ * receive from this socket.
+ */
+ if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
+ io_mshot_prep_retry(req, kmsg);
+ return false;
+ }
+
+ /* Otherwise stop bundle and use the current result. */
+finish:
+ io_req_set_res(req, *ret, cflags);
+ *ret = IOU_OK;
+ return true;
+}
+
int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
@@ -482,7 +566,6 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct io_async_msghdr *kmsg = req->async_data;
struct socket *sock;
- unsigned int cflags;
unsigned flags;
int min_ret = 0;
int ret;
@@ -495,21 +578,47 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
(sr->flags & IORING_RECVSEND_POLL_FIRST))
return -EAGAIN;
+ flags = sr->msg_flags;
+ if (issue_flags & IO_URING_F_NONBLOCK)
+ flags |= MSG_DONTWAIT;
+
+retry_bundle:
if (io_do_buffer_select(req)) {
- size_t len = sr->len;
- void __user *buf;
+ struct buf_sel_arg arg = {
+ .iovs = &kmsg->fast_iov,
+ .max_len = min_not_zero(sr->len, INT_MAX),
+ .nr_iovs = 1,
+ .mode = KBUF_MODE_EXPAND,
+ };
+
+ if (kmsg->free_iov) {
+ arg.nr_iovs = kmsg->free_iov_nr;
+ arg.iovs = kmsg->free_iov;
+ arg.mode |= KBUF_MODE_FREE;
+ }
- buf = io_buffer_select(req, &len, issue_flags);
- if (unlikely(!buf))
- return -ENOBUFS;
- sr->buf = buf;
- sr->len = len;
+ if (!(sr->flags & IORING_RECVSEND_BUNDLE))
+ arg.nr_iovs = 1;
+
+ ret = io_buffers_select(req, &arg, issue_flags);
+ if (unlikely(ret < 0))
+ return ret;
+
+ sr->len = arg.out_len;
+ iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, arg.iovs, ret,
+ arg.out_len);
+ if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
+ kmsg->free_iov_nr = ret;
+ kmsg->free_iov = arg.iovs;
+ }
}
- flags = sr->msg_flags;
- if (issue_flags & IO_URING_F_NONBLOCK)
- flags |= MSG_DONTWAIT;
- if (flags & MSG_WAITALL)
+ /*
+ * If MSG_WAITALL is set, or this is a bundle send, then we need
+ * the full amount. If just bundle is set, if we do a short send
+ * then we complete the bundle sequence rather than continue on.
+ */
+ if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
@@ -534,10 +643,12 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
ret += sr->done_io;
else if (sr->done_io)
ret = sr->done_io;
+
+ if (!io_send_finish(req, &ret, kmsg, issue_flags))
+ goto retry_bundle;
+
io_req_msg_cleanup(req, issue_flags);
- cflags = io_put_kbuf(req, issue_flags);
- io_req_set_res(req, ret, cflags);
- return IOU_OK;
+ return ret;
}
static int io_recvmsg_mshot_prep(struct io_kiocb *req,