summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/uapi/linux/io_uring.h18
-rw-r--r--io_uring/kbuf.c42
-rw-r--r--io_uring/kbuf.h42
3 files changed, 82 insertions, 20 deletions
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 042eab793e26..a275f91d2ac0 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -440,11 +440,21 @@ struct io_uring_cqe {
* IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv
* IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinct
* them from sends.
+ * IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion will get
+ * more completions. In other words, the buffer is being
+ * partially consumed, and will be used by the kernel for
+ * more completions. This is only set for buffers used via
+ * the incremental buffer consumption, as provided by
+ * a ring buffer setup with IOU_PBUF_RING_INC. For any
+ * other provided buffer type, any buffer passed back in
+ * a completion is automatically returned to the
+ * application.
*/
#define IORING_CQE_F_BUFFER (1U << 0)
#define IORING_CQE_F_MORE (1U << 1)
#define IORING_CQE_F_SOCK_NONEMPTY (1U << 2)
#define IORING_CQE_F_NOTIF (1U << 3)
+#define IORING_CQE_F_BUF_MORE (1U << 4)
#define IORING_CQE_BUFFER_SHIFT 16
@@ -716,9 +726,17 @@ struct io_uring_buf_ring {
* mmap(2) with the offset set as:
* IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
* to get a virtual mapping for the ring.
+ * IOU_PBUF_RING_INC: If set, buffers consumed from this buffer ring can be
+ * consumed incrementally. Normally one (or more) buffers
+ * are fully consumed. With incremental consumption, it's
+ * feasible to register big ranges of buffers, and each
+ * use of it will consume only as much as it needs. This
+ * requires that both the kernel and application keep
+ * track of where the current read/recv index is at.
*/
enum io_uring_register_pbuf_ring_flags {
IOU_PBUF_RING_MMAP = 1,
+ IOU_PBUF_RING_INC = 2,
};
/* argument for IORING_(UN)REGISTER_PBUF_RING */
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 55d01861d8c5..1f503bcc9c9f 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -212,14 +212,25 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
buf = io_ring_head_to_buf(br, head, bl->mask);
if (arg->max_len) {
u32 len = READ_ONCE(buf->len);
- size_t needed;
if (unlikely(!len))
return -ENOBUFS;
- needed = (arg->max_len + len - 1) / len;
- needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT);
- if (nr_avail > needed)
- nr_avail = needed;
+ /*
+ * Limit incremental buffers to 1 segment. No point trying
+ * to peek ahead and map more than we need, when the buffers
+ * themselves should be large when setup with
+ * IOU_PBUF_RING_INC.
+ */
+ if (bl->flags & IOBL_INC) {
+ nr_avail = 1;
+ } else {
+ size_t needed;
+
+ needed = (arg->max_len + len - 1) / len;
+ needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT);
+ if (nr_avail > needed)
+ nr_avail = needed;
+ }
}
/*
@@ -244,16 +255,21 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
req->buf_index = buf->bid;
do {
- /* truncate end piece, if needed */
- if (buf->len > arg->max_len)
- buf->len = arg->max_len;
+ u32 len = buf->len;
+
+ /* truncate end piece, if needed, for non partial buffers */
+ if (len > arg->max_len) {
+ len = arg->max_len;
+ if (!(bl->flags & IOBL_INC))
+ buf->len = len;
+ }
iov->iov_base = u64_to_user_ptr(buf->addr);
- iov->iov_len = buf->len;
+ iov->iov_len = len;
iov++;
- arg->out_len += buf->len;
- arg->max_len -= buf->len;
+ arg->out_len += len;
+ arg->max_len -= len;
if (!arg->max_len)
break;
@@ -675,7 +691,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (reg.resv[0] || reg.resv[1] || reg.resv[2])
return -EINVAL;
- if (reg.flags & ~IOU_PBUF_RING_MMAP)
+ if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
return -EINVAL;
if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
if (!reg.ring_addr)
@@ -713,6 +729,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (!ret) {
bl->nr_entries = reg.ring_entries;
bl->mask = reg.ring_entries - 1;
+ if (reg.flags & IOU_PBUF_RING_INC)
+ bl->flags |= IOBL_INC;
io_buffer_add_list(ctx, bl, reg.bgid);
return 0;
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index b41e2a0a0505..36aadfe5ac00 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -9,6 +9,9 @@ enum {
IOBL_BUF_RING = 1,
/* ring mapped provided buffers, but mmap'ed by application */
IOBL_MMAP = 2,
+ /* buffers are consumed incrementally rather than always fully */
+ IOBL_INC = 4,
+
};
struct io_buffer_list {
@@ -124,24 +127,45 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
/* Mapped buffer ring, return io_uring_buf from head */
#define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)]
-static inline void io_kbuf_commit(struct io_kiocb *req,
+static inline bool io_kbuf_commit(struct io_kiocb *req,
struct io_buffer_list *bl, int len, int nr)
{
if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT)))
- return;
- bl->head += nr;
+ return true;
+
req->flags &= ~REQ_F_BUFFERS_COMMIT;
+
+ if (unlikely(len < 0))
+ return true;
+
+ if (bl->flags & IOBL_INC) {
+ struct io_uring_buf *buf;
+
+ buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask);
+ if (WARN_ON_ONCE(len > buf->len))
+ len = buf->len;
+ buf->len -= len;
+ if (buf->len) {
+ buf->addr += len;
+ return false;
+ }
+ }
+
+ bl->head += nr;
+ return true;
}
-static inline void __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr)
+static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr)
{
struct io_buffer_list *bl = req->buf_list;
+ bool ret = true;
if (bl) {
- io_kbuf_commit(req, bl, len, nr);
+ ret = io_kbuf_commit(req, bl, len, nr);
req->buf_index = bl->bgid;
}
req->flags &= ~REQ_F_BUFFER_RING;
+ return ret;
}
static inline void __io_put_kbuf_list(struct io_kiocb *req, int len,
@@ -176,10 +200,12 @@ static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int len,
return 0;
ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
- if (req->flags & REQ_F_BUFFER_RING)
- __io_put_kbuf_ring(req, len, nbufs);
- else
+ if (req->flags & REQ_F_BUFFER_RING) {
+ if (!__io_put_kbuf_ring(req, len, nbufs))
+ ret |= IORING_CQE_F_BUF_MORE;
+ } else {
__io_put_kbuf(req, len, issue_flags);
+ }
return ret;
}