From 6f0974eccbf78baead1735722c4f1ee3eb9422cd Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 11 Mar 2024 13:30:43 -0600
Subject: io_uring: don't save/restore iowait state

This kind of state is per-syscall, and since we're doing the waiting off
entering the io_uring_enter(2) syscall, there's no way that iowait can
already be set for this case. Simplify it by setting it if we need to,
and always clearing it to 0 when done.

Fixes: 7b72d661f1f2 ("io_uring: gate iowait schedule on having pending requests")
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index cf348c33f485..49a124daa359 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2539,7 +2539,7 @@ static bool current_pending_io(void)
 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 					  struct io_wait_queue *iowq)
 {
-	int io_wait, ret;
+	int ret;
 
 	if (unlikely(READ_ONCE(ctx->check_cq)))
 		return 1;
@@ -2557,7 +2557,6 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 	 * can take into account that the task is waiting for IO - turns out
 	 * to be important for low QD IO.
 	 */
-	io_wait = current->in_iowait;
 	if (current_pending_io())
 		current->in_iowait = 1;
 	ret = 0;
@@ -2565,7 +2564,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 		schedule();
 	else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
 		ret = -ETIME;
-	current->in_iowait = io_wait;
+	current->in_iowait = 0;
 	return ret;
 }
 
-- 
cgit v1.2.3-58-ga151

From 0a3737db8479b77f95f4bfda8e71b03c697eb56a Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 12 Mar 2024 08:29:47 -0600
Subject: io_uring/rw: return IOU_ISSUE_SKIP_COMPLETE for multishot retry

If read multishot is being invoked from the poll retry handler, then we
should return IOU_ISSUE_SKIP_COMPLETE rather than -EAGAIN. If not, then
a CQE will be posted with -EAGAIN rather than triggering the retry when
the file is flagged as readable again.

Cc: stable@vger.kernel.org
Reported-by: Sargun Dhillon
Fixes: fc68fcda04910 ("io_uring/rw: add support for IORING_OP_READ_MULTISHOT")
Signed-off-by: Jens Axboe
---
 io_uring/rw.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/io_uring/rw.c b/io_uring/rw.c
index 47e097ab5d7e..0585ebcc9773 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -947,6 +947,8 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
 		 */
 		if (io_kbuf_recycle(req, issue_flags))
 			rw->len = 0;
+		if (issue_flags & IO_URING_F_MULTISHOT)
+			return IOU_ISSUE_SKIP_COMPLETE;
 		return -EAGAIN;
 	}
 
-- 
cgit v1.2.3-58-ga151

From cef59d1ea7170ec753182302645a0191c8aa3382 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Tue, 12 Mar 2024 14:56:27 +0000
Subject: io_uring: clean rings on NO_MMAP alloc fail

We make a few cancellation judgements based on ctx->rings, so let's
zero it after deallocation for IORING_SETUP_NO_MMAP just like it's done
with the mmap case. Likely, it's not a real problem, but zeroing is
safer and better tested.

Cc: stable@vger.kernel.org
Fixes: 03d89a2de25bbc ("io_uring: support for user allocated memory for rings/sqes")
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/9ff6cdf91429b8a51699c210e1f6af6ea3f8bdcf.1710255382.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 49a124daa359..e7d7a456b489 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2788,14 +2788,15 @@ static void io_rings_free(struct io_ring_ctx *ctx)
 	if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
 		io_mem_free(ctx->rings);
 		io_mem_free(ctx->sq_sqes);
-		ctx->rings = NULL;
-		ctx->sq_sqes = NULL;
 	} else {
 		io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
 		ctx->n_ring_pages = 0;
 		io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
 		ctx->n_sqe_pages = 0;
 	}
+
+	ctx->rings = NULL;
+	ctx->sq_sqes = NULL;
 }
 
 void *io_mem_alloc(size_t size)
-- 
cgit v1.2.3-58-ga151

From 2c5c0ba1179d31b0a030b45a16df6181d1bc3ea6 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 13 Mar 2024 15:52:40 +0000
Subject: io_uring: simplify io_pages_free

We never pass a null (top-level) pointer, remove the check.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/0e1a46f9a5cd38e6876905e8030bdff9b0845e96.1710343154.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index e7d7a456b489..48c8d74e86ab 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2696,13 +2696,9 @@ void io_mem_free(void *ptr)
 
 static void io_pages_free(struct page ***pages, int npages)
 {
-	struct page **page_array;
+	struct page **page_array = *pages;
 	int i;
 
-	if (!pages)
-		return;
-
-	page_array = *pages;
 	if (!page_array)
 		return;
 
-- 
cgit v1.2.3-58-ga151

From 9219e4a9d4ad57323837f7c3562964e61840b17a Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Wed, 13 Mar 2024 15:52:41 +0000
Subject: io_uring/kbuf: rename is_mapped

In buffer lists we have ->is_mapped as well as ->is_mmap, it's pretty
hard to stay sane double checking which one means what, and in the long
run there is a high chance of an eventual bug. Rename ->is_mapped into
->is_buf_ring.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/c4838f4d8ad506ad6373f1c305aee2d2c1a89786.1710343154.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/kbuf.c | 20 ++++++++++----------
 io_uring/kbuf.h |  2 +-
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 9be42bff936b..693c26da4ee1 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -199,7 +199,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
 
 	bl = io_buffer_get_list(ctx, req->buf_index);
 	if (likely(bl)) {
-		if (bl->is_mapped)
+		if (bl->is_buf_ring)
 			ret = io_ring_buffer_select(req, len, bl, issue_flags);
 		else
 			ret = io_provided_buffer_select(req, len, bl);
@@ -253,7 +253,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
 	if (!nbufs)
 		return 0;
 
-	if (bl->is_mapped) {
+	if (bl->is_buf_ring) {
 		i = bl->buf_ring->tail - bl->head;
 		if (bl->is_mmap) {
 			/*
@@ -274,7 +274,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
 		}
 		/* make sure it's seen as empty */
 		INIT_LIST_HEAD(&bl->buf_list);
-		bl->is_mapped = 0;
+		bl->is_buf_ring = 0;
 		return i;
 	}
 
@@ -361,7 +361,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
 	if (bl) {
 		ret = -EINVAL;
 		/* can't use provide/remove buffers command on mapped buffers */
-		if (!bl->is_mapped)
+		if (!bl->is_buf_ring)
 			ret = __io_remove_buffers(ctx, bl, p->nbufs);
 	}
 	io_ring_submit_unlock(ctx, issue_flags);
@@ -519,7 +519,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 		}
 	}
 	/* can't add buffers via this command for a mapped buffer ring */
-	if (bl->is_mapped) {
+	if (bl->is_buf_ring) {
 		ret = -EINVAL;
 		goto err;
 	}
@@ -575,7 +575,7 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
 	bl->buf_pages = pages;
 	bl->buf_nr_pages = nr_pages;
 	bl->buf_ring = br;
-	bl->is_mapped = 1;
+	bl->is_buf_ring = 1;
 	bl->is_mmap = 0;
 	return 0;
 error_unpin:
@@ -642,7 +642,7 @@ static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
 	}
 	ibf->inuse = 1;
 	bl->buf_ring = ibf->mem;
-	bl->is_mapped = 1;
+	bl->is_buf_ring = 1;
 	bl->is_mmap = 1;
 	return 0;
 }
@@ -688,7 +688,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	bl = io_buffer_get_list(ctx, reg.bgid);
 	if (bl) {
 		/* if mapped buffer ring OR classic exists, don't allow */
-		if (bl->is_mapped || !list_empty(&bl->buf_list))
+		if (bl->is_buf_ring || !list_empty(&bl->buf_list))
 			return -EEXIST;
 	} else {
 		free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
@@ -730,7 +730,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	bl = io_buffer_get_list(ctx, reg.bgid);
 	if (!bl)
 		return -ENOENT;
-	if (!bl->is_mapped)
+	if (!bl->is_buf_ring)
 		return -EINVAL;
 
 	__io_remove_buffers(ctx, bl, -1U);
@@ -757,7 +757,7 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
 	bl = io_buffer_get_list(ctx, buf_status.buf_group);
 	if (!bl)
 		return -ENOENT;
-	if (!bl->is_mapped)
+	if (!bl->is_buf_ring)
 		return -EINVAL;
 
 	buf_status.head = bl->head;
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 5218bfd79e87..1c7b654ee726 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -26,7 +26,7 @@ struct io_buffer_list {
 	__u16 mask;
 
 	/* ring mapped provided buffers */
-	__u8 is_mapped;
+	__u8 is_buf_ring;
 	/* ring mapped provided buffers, but mmap'ed by application */
 	__u8 is_mmap;
 	/* bl is visible from an RCU point of view for lookup */
-- 
cgit v1.2.3-58-ga151

From 67d1189d1095d471ed7fa426c7e384a7140a5dd7 Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi
Date: Wed, 13 Mar 2024 17:39:12 -0400
Subject: io_uring: Fix release of pinned pages when __io_uaddr_map fails

Looking at the error path of __io_uaddr_map, if we fail after pinning
the pages for any reason, ret will be set to -EINVAL and the error
handler won't properly release the pinned pages.

I didn't manage to trigger it without forcing a failure, but it can
happen in real life when memory is heavily fragmented.

Signed-off-by: Gabriel Krisman Bertazi
Fixes: 223ef4743164 ("io_uring: don't allow IORING_SETUP_NO_MMAP rings on highmem pages")
Link: https://lore.kernel.org/r/20240313213912.1920-1-krisman@suse.de
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 48c8d74e86ab..3ae4bb988906 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2714,7 +2714,7 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
 	struct page **page_array;
 	unsigned int nr_pages;
 	void *page_addr;
-	int ret, i;
+	int ret, i, pinned;
 
 	*npages = 0;
 
@@ -2728,12 +2728,12 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
 	if (!page_array)
 		return ERR_PTR(-ENOMEM);
 
-	ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
-					page_array);
-	if (ret != nr_pages) {
-err:
-		io_pages_free(&page_array, ret > 0 ? ret : 0);
-		return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT);
+
+	pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+				     page_array);
+	if (pinned != nr_pages) {
+		ret = (pinned < 0) ? pinned : -EFAULT;
+		goto free_pages;
 	}
 
 	page_addr = page_address(page_array[0]);
@@ -2747,7 +2747,7 @@ err:
 		 * didn't support this feature.
 		 */
 		if (PageHighMem(page_array[i]))
-			goto err;
+			goto free_pages;
 
 		/*
 		 * No support for discontig pages for now, should either be a
@@ -2756,13 +2756,17 @@ err:
 		 * just fail them with EINVAL.
 		 */
 		if (page_address(page_array[i]) != page_addr)
-			goto err;
+			goto free_pages;
 		page_addr += PAGE_SIZE;
 	}
 
 	*pages = page_array;
 	*npages = nr_pages;
 	return page_to_virt(page_array[0]);
+
+free_pages:
+	io_pages_free(&page_array, pinned > 0 ? pinned : 0);
+	return ERR_PTR(ret);
 }
 
 static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
-- 
cgit v1.2.3-58-ga151

From 5e3afe580a9f5ca173a6bd55ffe10948796ef7e5 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Fri, 15 Mar 2024 15:29:51 +0000
Subject: io_uring: fix poll_remove stalled req completion

Taking the ctx lock is not enough to use the deferred request completion
infrastructure; it'll get queued into the list but no one would expect it
there, so it will sit there until the next io_submit_flush_completions().
It's hard to care about the cancellation path, so complete it via tw.

Fixes: ef7dfac51d8ed ("io_uring/poll: serialize poll linked timer start with poll removal")
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/c446740bc16858f8a2a8dcdce899812f21d15f23.1710514702.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 io_uring/poll.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/io_uring/poll.c b/io_uring/poll.c
index 5f779139cae1..6db1dcb2c797 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -996,7 +996,6 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_hash_bucket *bucket;
 	struct io_kiocb *preq;
 	int ret2, ret = 0;
-	struct io_tw_state ts = { .locked = true };
 
 	io_ring_submit_lock(ctx, issue_flags);
 	preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket);
@@ -1045,7 +1044,8 @@ found:
 
 	req_set_fail(preq);
 	io_req_set_res(preq, -ECANCELED, 0);
-	io_req_task_complete(preq, &ts);
+	preq->io_task_work.func = io_req_task_complete;
+	io_req_task_work_add(preq);
 out:
 	io_ring_submit_unlock(ctx, issue_flags);
 	if (ret < 0) {
-- 
cgit v1.2.3-58-ga151

From 30dab608c3cb99c2a05b76289fd05551703979ae Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 15 Mar 2024 15:37:15 -0600
Subject: io_uring/futex: always remove futex entry for cancel all

We know the request is either being removed, or already in the process
of being removed through task_work, so we can delete it from our futex
list upfront. This is important for remove-all conditions, as we
otherwise will find it multiple times and prevent cancelation progress.

Cc: stable@vger.kernel.org
Fixes: 194bb58c6090 ("io_uring: add support for futex wake and wait")
Fixes: 8f350194d5cf ("io_uring: add support for vectored futex waits")
Signed-off-by: Jens Axboe
---
 io_uring/futex.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/io_uring/futex.c b/io_uring/futex.c
index 3c3575303c3d..792a03df58de 100644
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -159,6 +159,7 @@ bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
 	hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) {
 		if (!io_match_task_safe(req, task, cancel_all))
 			continue;
+		hlist_del_init(&req->hash_node);
 		__io_futex_cancel(ctx, req);
 		found = true;
 	}
-- 
cgit v1.2.3-58-ga151

From 2b35b8b43e07b1a6f06fdd84cf4b9eb24785896d Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 15 Mar 2024 15:42:49 -0600
Subject: io_uring/waitid: always remove waitid entry for cancel all

We know the request is either being removed, or already in the process
of being removed through task_work, so we can delete it from our waitid
list upfront. This is important for remove-all conditions, as we
otherwise will find it multiple times and prevent cancelation progress.

Remove the dead check in cancelation as well for the hash_node being
empty or not. We already have a waitid reference check for ownership,
so we don't need to check the list too.

Cc: stable@vger.kernel.org
Fixes: f31ecf671ddc ("io_uring: add IORING_OP_WAITID support")
Signed-off-by: Jens Axboe
---
 io_uring/waitid.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/io_uring/waitid.c b/io_uring/waitid.c
index 6f851978606d..77d340666cb9 100644
--- a/io_uring/waitid.c
+++ b/io_uring/waitid.c
@@ -125,12 +125,6 @@ static void io_waitid_complete(struct io_kiocb *req, int ret)
 
 	lockdep_assert_held(&req->ctx->uring_lock);
 
-	/*
-	 * Did cancel find it meanwhile?
-	 */
-	if (hlist_unhashed(&req->hash_node))
-		return;
-
 	hlist_del_init(&req->hash_node);
 
 	ret = io_waitid_finish(req, ret);
@@ -202,6 +196,7 @@ bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
 	hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) {
 		if (!io_match_task_safe(req, task, cancel_all))
 			continue;
+		hlist_del_init(&req->hash_node);
 		__io_waitid_cancel(ctx, req);
 		found = true;
 	}
-- 
cgit v1.2.3-58-ga151

From f3a640cca951ef9715597e68f5363afc0f452a88 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 15 Mar 2024 16:36:23 -0600
Subject: io_uring/net: ensure async prep handlers always initialize ->done_io

If we get a request with IOSQE_ASYNC set, then we first run the prep
async handlers. But if we then fail setting it up and want to post a CQE
with -EINVAL, we use ->done_io. This was previously guarded with
REQ_F_PARTIAL_IO, and the normal setup handlers do set it up before any
potential errors, but we need to cover the async setup too.

Fixes: 9817ad85899f ("io_uring/net: remove dependency on REQ_F_PARTIAL_IO for sr->done_io")
Signed-off-by: Jens Axboe
---
 io_uring/net.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/io_uring/net.c b/io_uring/net.c
index 19451f0dbf81..1e7665ff6ef7 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -326,7 +326,10 @@ int io_send_prep_async(struct io_kiocb *req)
 	struct io_async_msghdr *io;
 	int ret;
 
-	if (!zc->addr || req_has_async_data(req))
+	if (req_has_async_data(req))
+		return 0;
+	zc->done_io = 0;
+	if (!zc->addr)
 		return 0;
 	io = io_msg_alloc_async_prep(req);
 	if (!io)
@@ -353,8 +356,10 @@ static int io_setup_async_addr(struct io_kiocb *req,
 
 int io_sendmsg_prep_async(struct io_kiocb *req)
 {
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 	int ret;
 
+	sr->done_io = 0;
 	if (!io_msg_alloc_async_prep(req))
 		return -ENOMEM;
 	ret = io_sendmsg_copy_hdr(req, req->async_data);
@@ -608,9 +613,11 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
 
 int io_recvmsg_prep_async(struct io_kiocb *req)
 {
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 	struct io_async_msghdr *iomsg;
 	int ret;
 
+	sr->done_io = 0;
 	if (!io_msg_alloc_async_prep(req))
 		return -ENOMEM;
 	iomsg = req->async_data;
-- 
cgit v1.2.3-58-ga151

From e21e1c45e1fe2e31732f40256b49c04e76a17cee Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 16 Mar 2024 09:51:40 -0600
Subject: io_uring: clear opcode specific data for an early failure

If failure happens before the opcode prep handler is called, ensure that
we clear the opcode specific area of the request, which holds data
specific to that request type. This prevents errors where an opcode
handler doesn't get to clear its per-request private data, since prep
isn't even called.

Reported-and-tested-by: syzbot+f8e9a371388aa62ecab4@syzkaller.appspotmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 3ae4bb988906..5d4b448fdc50 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2181,6 +2181,13 @@ static void io_init_req_drain(struct io_kiocb *req)
 	}
 }
 
+static __cold int io_init_fail_req(struct io_kiocb *req, int err)
+{
+	/* ensure per-opcode data is cleared if we fail before prep */
+	memset(&req->cmd.data, 0, sizeof(req->cmd.data));
+	return err;
+}
+
 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		       const struct io_uring_sqe *sqe)
 	__must_hold(&ctx->uring_lock)
@@ -2202,29 +2209,29 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 
 	if (unlikely(opcode >= IORING_OP_LAST)) {
 		req->opcode = 0;
-		return -EINVAL;
+		return io_init_fail_req(req, -EINVAL);
 	}
 	def = &io_issue_defs[opcode];
 	if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
 		/* enforce forwards compatibility on users */
 		if (sqe_flags & ~SQE_VALID_FLAGS)
-			return -EINVAL;
+			return io_init_fail_req(req, -EINVAL);
 		if (sqe_flags & IOSQE_BUFFER_SELECT) {
 			if (!def->buffer_select)
-				return -EOPNOTSUPP;
+				return io_init_fail_req(req, -EOPNOTSUPP);
 			req->buf_index = READ_ONCE(sqe->buf_group);
 		}
 		if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
 			ctx->drain_disabled = true;
 		if (sqe_flags & IOSQE_IO_DRAIN) {
 			if (ctx->drain_disabled)
-				return -EOPNOTSUPP;
+				return io_init_fail_req(req, -EOPNOTSUPP);
 			io_init_req_drain(req);
 		}
 	}
 	if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
 		if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
-			return -EACCES;
+			return io_init_fail_req(req, -EACCES);
 		/* knock it to the slow queue path, will be drained there */
 		if (ctx->drain_active)
 			req->flags |= REQ_F_FORCE_ASYNC;
@@ -2237,9 +2244,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	}
 
 	if (!def->ioprio && sqe->ioprio)
-		return -EINVAL;
+		return io_init_fail_req(req, -EINVAL);
 	if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
-		return -EINVAL;
+		return io_init_fail_req(req, -EINVAL);
 
 	if (def->needs_file) {
 		struct io_submit_state *state = &ctx->submit_state;
@@ -2263,12 +2270,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 
 		req->creds = xa_load(&ctx->personalities, personality);
 		if (!req->creds)
-			return -EINVAL;
+			return io_init_fail_req(req, -EINVAL);
 		get_cred(req->creds);
 		ret = security_uring_override_creds(req->creds);
 		if (ret) {
 			put_cred(req->creds);
-			return ret;
+			return io_init_fail_req(req, ret);
 		}
 		req->flags |= REQ_F_CREDS;
 	}
-- 
cgit v1.2.3-58-ga151

From 1251d2025c3e1bcf1f17ec0f3c0dfae5e5bbb146 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 18 Mar 2024 20:22:42 -0600
Subject: io_uring/sqpoll: early exit thread if task_context wasn't allocated

Ideally we'd want to simply kill the task rather than wake it, but for
now let's just add a startup check that causes the thread to exit.
This can only happen if io_uring_alloc_task_context() fails, which
generally requires fault injection.

Reported-by: Ubisectech Sirius
Fixes: af5d68f8892f ("io_uring/sqpoll: manage task_work privately")
Signed-off-by: Jens Axboe
---
 io_uring/sqpoll.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 363052b4ea76..3983708cef5b 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -274,6 +274,10 @@ static int io_sq_thread(void *data)
 	char buf[TASK_COMM_LEN];
 	DEFINE_WAIT(wait);
 
+	/* offload context creation failed, just exit */
+	if (!current->io_uring)
+		goto err_out;
+
 	snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
 	set_task_comm(current, buf);
 
@@ -371,7 +375,7 @@ static int io_sq_thread(void *data)
 		atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags);
 	io_run_task_work();
 	mutex_unlock(&sqd->lock);
-
+err_out:
 	complete(&sqd->exited);
 	do_exit(0);
 }
-- 
cgit v1.2.3-58-ga151
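
As a userspace-side illustration of the multishot read behavior fixed earlier in
this series ("io_uring/rw: return IOU_ISSUE_SKIP_COMPLETE for multishot retry"),
the sketch below arms a single IORING_OP_READ_MULTISHOT request on a pipe with a
provided buffer ring; each readable event should surface as a CQE with
IORING_CQE_F_MORE set, rather than a spurious -EAGAIN completion when the retry
comes from the poll handler. This is a minimal sketch, not part of the patches
above; it assumes liburing >= 2.6 (for io_uring_prep_read_multishot()) and omits
most error handling.

/* illustrative only; build roughly as: gcc -o mshot mshot.c -luring */
#include <liburing.h>
#include <stdio.h>
#include <unistd.h>

#define BGID     1       /* arbitrary buffer group id for this sketch */
#define NBUFS    8       /* must be a power of two */
#define BUF_SIZE 4096

int main(void)
{
	static char bufs[NBUFS][BUF_SIZE];
	struct io_uring_buf_ring *br;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring ring;
	int fds[2], ret, i;

	if (pipe(fds) < 0 || io_uring_queue_init(8, &ring, 0) < 0)
		return 1;

	/* register a provided buffer ring for group BGID and fill it */
	br = io_uring_setup_buf_ring(&ring, NBUFS, BGID, 0, &ret);
	if (!br)
		return 1;
	for (i = 0; i < NBUFS; i++)
		io_uring_buf_ring_add(br, bufs[i], BUF_SIZE, i,
				      io_uring_buf_ring_mask(NBUFS), i);
	io_uring_buf_ring_advance(br, NBUFS);

	/* one SQE stays armed and posts a CQE per readable event */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read_multishot(sqe, fds[0], 0, 0, BGID);
	io_uring_submit(&ring);

	write(fds[1], "hello", 5);

	io_uring_wait_cqe(&ring, &cqe);
	/* cqe->res = bytes read; IORING_CQE_F_MORE means the request is still armed */
	printf("res=%d more=%d\n", cqe->res,
	       !!(cqe->flags & IORING_CQE_F_MORE));
	io_uring_cqe_seen(&ring, cqe);

	io_uring_queue_exit(&ring);
	return 0;
}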