diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-06-26 12:30:26 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-06-26 12:30:26 -0700 |
commit | 0aa69d53ac7c30f6184f88f2e310d808b32b35a5 (patch) | |
tree | 26a666a128578dd1c46f39b432543411caa0d6eb | |
parent | 3eccc0c886b1796f95a289c9d127c8ca1a254bd5 (diff) | |
parent | c98c81a4ac37b651be7eb9d16f562fc4acc5f867 (diff) |
Merge tag 'for-6.5/io_uring-2023-06-23' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:
"Nothing major in this release, just a bunch of cleanups and some
optimizations around networking mostly.
- clean up file request flags handling (Christoph)
- clean up request freeing and CQ locking (Pavel)
- support for using pre-registering the io_uring fd at setup time
(Josh)
- Add support for user allocated ring memory, rather than having the
kernel allocate it. Mostly for packing rings into a huge page (me)
- avoid an unnecessary double retry on receive (me)
- maintain ordering for task_work, which also improves performance
(me)
- misc cleanups/fixes (Pavel, me)"
* tag 'for-6.5/io_uring-2023-06-23' of git://git.kernel.dk/linux: (39 commits)
io_uring: merge conditional unlock flush helpers
io_uring: make io_cq_unlock_post static
io_uring: inline __io_cq_unlock
io_uring: fix acquire/release annotations
io_uring: kill io_cq_unlock()
io_uring: remove IOU_F_TWQ_FORCE_NORMAL
io_uring: don't batch task put on reqs free
io_uring: move io_clean_op()
io_uring: inline io_dismantle_req()
io_uring: remove io_free_req_tw
io_uring: open code io_put_req_find_next
io_uring: add helpers to decode the fixed file file_ptr
io_uring: use io_file_from_index in io_msg_grab_file
io_uring: use io_file_from_index in __io_sync_cancel
io_uring: return REQ_F_ flags from io_file_get_flags
io_uring: remove io_req_ffs_set
io_uring: remove a confusing comment above io_file_get_flags
io_uring: remove the mode variable in io_file_get_flags
io_uring: remove __io_file_supports_nowait
io_uring: wait interruptibly for request completions on exit
...
-rw-r--r-- | block/fops.c | 5 | ||||
-rw-r--r-- | drivers/nvme/host/ioctl.c | 4 | ||||
-rw-r--r-- | include/linux/io_uring.h | 18 | ||||
-rw-r--r-- | include/linux/io_uring_types.h | 10 | ||||
-rw-r--r-- | include/uapi/linux/io_uring.h | 16 | ||||
-rw-r--r-- | io_uring/cancel.c | 5 | ||||
-rw-r--r-- | io_uring/filetable.c | 11 | ||||
-rw-r--r-- | io_uring/filetable.h | 28 | ||||
-rw-r--r-- | io_uring/io_uring.c | 497 | ||||
-rw-r--r-- | io_uring/io_uring.h | 17 | ||||
-rw-r--r-- | io_uring/msg_ring.c | 4 | ||||
-rw-r--r-- | io_uring/net.c | 58 | ||||
-rw-r--r-- | io_uring/poll.c | 6 | ||||
-rw-r--r-- | io_uring/poll.h | 2 | ||||
-rw-r--r-- | io_uring/rsrc.c | 8 | ||||
-rw-r--r-- | io_uring/rw.c | 6 | ||||
-rw-r--r-- | io_uring/rw.h | 1 | ||||
-rw-r--r-- | io_uring/tctx.c | 31 | ||||
-rw-r--r-- | io_uring/timeout.c | 6 | ||||
-rw-r--r-- | io_uring/uring_cmd.c | 16 | ||||
-rw-r--r-- | net/socket.c | 1 |
21 files changed, 416 insertions, 334 deletions
diff --git a/block/fops.c b/block/fops.c index 18dedb730493..4faeada05b23 100644 --- a/block/fops.c +++ b/block/fops.c @@ -481,7 +481,7 @@ static int blkdev_open(struct inode *inode, struct file *filp) * during an unstable branch. */ filp->f_flags |= O_LARGEFILE; - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + filp->f_mode |= FMODE_BUF_RASYNC; if (filp->f_flags & O_NDELAY) filp->f_mode |= FMODE_NDELAY; @@ -494,6 +494,9 @@ static int blkdev_open(struct inode *inode, struct file *filp) if (IS_ERR(bdev)) return PTR_ERR(bdev); + if (bdev_nowait(bdev)) + filp->f_mode |= FMODE_NOWAIT; + filp->private_data = bdev; filp->f_mapping = bdev->bd_inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index f15e7330b75a..b97744e05551 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -521,7 +521,7 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, if (cookie != NULL && blk_rq_is_poll(req)) nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED); else - io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb); + io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); return RQ_END_IO_FREE; } @@ -543,7 +543,7 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req, if (cookie != NULL && blk_rq_is_poll(req)) nvme_uring_task_meta_cb(ioucmd, IO_URING_F_UNLOCKED); else - io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_meta_cb); + io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_meta_cb); return RQ_END_IO_NONE; } diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 7fe31b2cd02f..bb9c666bd584 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -46,13 +46,23 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd); void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, unsigned issue_flags); -void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)); struct sock *io_uring_get_socket(struct file *file); void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); void io_uring_unreg_ringfd(void); const char *io_uring_get_opcode(u8 opcode); +void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned), + unsigned flags); +/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */ +void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)); + +static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ + __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); +} static inline void io_uring_files_cancel(void) { @@ -85,6 +95,10 @@ static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, void (*task_work_cb)(struct io_uring_cmd *, unsigned)) { } +static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ +} static inline struct sock *io_uring_get_socket(struct file *file) { return NULL; diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 1b2a20a42413..f04ce513fadb 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -211,6 +211,16 @@ struct io_ring_ctx { unsigned int compat: 1; enum task_work_notify_mode notify_method; + + /* + * If IORING_SETUP_NO_MMAP is used, then the below holds + * the gup'ed pages for the two rings, and the sqes. + */ + unsigned short n_ring_pages; + unsigned short n_sqe_pages; + struct page **ring_pages; + struct page **sqe_pages; + struct io_rings *rings; struct task_struct *submitter_task; struct percpu_ref refs; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 0716cb17e436..f222d263bc55 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -173,6 +173,18 @@ enum { */ #define IORING_SETUP_DEFER_TASKRUN (1U << 13) +/* + * Application provides the memory for the rings + */ +#define IORING_SETUP_NO_MMAP (1U << 14) + +/* + * Register the ring fd in itself for use with + * IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather + * than an fd. + */ +#define IORING_SETUP_REGISTERED_FD_ONLY (1U << 15) + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, @@ -406,7 +418,7 @@ struct io_sqring_offsets { __u32 dropped; __u32 array; __u32 resv1; - __u64 resv2; + __u64 user_addr; }; /* @@ -425,7 +437,7 @@ struct io_cqring_offsets { __u32 cqes; __u32 flags; __u32 resv1; - __u64 resv2; + __u64 user_addr; }; /* diff --git a/io_uring/cancel.c b/io_uring/cancel.c index b4f5dfacc0c3..58c46c852bdd 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -216,13 +216,10 @@ static int __io_sync_cancel(struct io_uring_task *tctx, /* fixed must be grabbed every time since we drop the uring_lock */ if ((cd->flags & IORING_ASYNC_CANCEL_FD) && (cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) { - unsigned long file_ptr; - if (unlikely(fd >= ctx->nr_user_files)) return -EBADF; fd = array_index_nospec(fd, ctx->nr_user_files); - file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; - cd->file = (struct file *) (file_ptr & FFS_MASK); + cd->file = io_file_from_index(&ctx->file_table, fd); if (!cd->file) return -EBADF; } diff --git a/io_uring/filetable.c b/io_uring/filetable.c index 0f6fa791a47d..e7d749991de4 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -78,10 +78,8 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); if (file_slot->file_ptr) { - struct file *old_file; - - old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, slot_index, old_file); + ret = io_queue_rsrc_removal(ctx->file_data, slot_index, + io_slot_file(file_slot)); if (ret) return ret; @@ -140,7 +138,6 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) { struct io_fixed_file *file_slot; - struct file *file; int ret; if (unlikely(!ctx->file_data)) @@ -153,8 +150,8 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) if (!file_slot->file_ptr) return -EBADF; - file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, offset, file); + ret = io_queue_rsrc_removal(ctx->file_data, offset, + io_slot_file(file_slot)); if (ret) return ret; diff --git a/io_uring/filetable.h b/io_uring/filetable.h index 351111ff8882..b47adf170c31 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -5,10 +5,6 @@ #include <linux/file.h> #include <linux/io_uring_types.h> -#define FFS_NOWAIT 0x1UL -#define FFS_ISREG 0x2UL -#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) - bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); void io_free_file_tables(struct io_file_table *table); @@ -43,21 +39,31 @@ io_fixed_file_slot(struct io_file_table *table, unsigned i) return &table->files[i]; } +#define FFS_NOWAIT 0x1UL +#define FFS_ISREG 0x2UL +#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) + +static inline unsigned int io_slot_flags(struct io_fixed_file *slot) +{ + return (slot->file_ptr & ~FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT; +} + +static inline struct file *io_slot_file(struct io_fixed_file *slot) +{ + return (struct file *)(slot->file_ptr & FFS_MASK); +} + static inline struct file *io_file_from_index(struct io_file_table *table, int index) { - struct io_fixed_file *slot = io_fixed_file_slot(table, index); - - return (struct file *) (slot->file_ptr & FFS_MASK); + return io_slot_file(io_fixed_file_slot(table, index)); } static inline void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file) { - unsigned long file_ptr = (unsigned long) file; - - file_ptr |= io_file_get_flags(file); - file_slot->file_ptr = file_ptr; + file_slot->file_ptr = (unsigned long)file | + (io_file_get_flags(file) >> REQ_F_SUPPORT_NOWAIT_BIT); } static inline void io_reset_alloc_hint(struct io_ring_ctx *ctx) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3bca7a79efda..1b53a2ab0a27 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -95,6 +95,7 @@ #include "timeout.h" #include "poll.h" +#include "rw.h" #include "alloc_cache.h" #define IORING_MAX_ENTRIES 32768 @@ -145,8 +146,6 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); -static void io_dismantle_req(struct io_kiocb *req); -static void io_clean_op(struct io_kiocb *req); static void io_queue_sqe(struct io_kiocb *req); static void io_move_task_work_from_local(struct io_ring_ctx *ctx); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); @@ -367,6 +366,39 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } +static void io_clean_op(struct io_kiocb *req) +{ + if (req->flags & REQ_F_BUFFER_SELECTED) { + spin_lock(&req->ctx->completion_lock); + io_put_kbuf_comp(req); + spin_unlock(&req->ctx->completion_lock); + } + + if (req->flags & REQ_F_NEED_CLEANUP) { + const struct io_cold_def *def = &io_cold_defs[req->opcode]; + + if (def->cleanup) + def->cleanup(req); + } + if ((req->flags & REQ_F_POLLED) && req->apoll) { + kfree(req->apoll->double_poll); + kfree(req->apoll); + req->apoll = NULL; + } + if (req->flags & REQ_F_INFLIGHT) { + struct io_uring_task *tctx = req->task->io_uring; + + atomic_dec(&tctx->inflight_tracked); + } + if (req->flags & REQ_F_CREDS) + put_cred(req->creds); + if (req->flags & REQ_F_ASYNC_DATA) { + kfree(req->async_data); + req->async_data = NULL; + } + req->flags &= ~IO_REQ_CLEAN_FLAGS; +} + static inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { @@ -423,8 +455,8 @@ static void io_prep_async_work(struct io_kiocb *req) if (req->flags & REQ_F_FORCE_ASYNC) req->work.flags |= IO_WQ_WORK_CONCURRENT; - if (req->file && !io_req_ffs_set(req)) - req->flags |= io_file_get_flags(req->file) << REQ_F_SUPPORT_NOWAIT_BIT; + if (req->file && !(req->flags & REQ_F_FIXED_FILE)) + req->flags |= io_file_get_flags(req->file); if (req->file && (req->flags & REQ_F_ISREG)) { bool should_hash = def->hash_reg_file; @@ -594,42 +626,18 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) } static inline void __io_cq_lock(struct io_ring_ctx *ctx) - __acquires(ctx->completion_lock) { if (!ctx->task_complete) spin_lock(&ctx->completion_lock); } -static inline void __io_cq_unlock(struct io_ring_ctx *ctx) -{ - if (!ctx->task_complete) - spin_unlock(&ctx->completion_lock); -} - static inline void io_cq_lock(struct io_ring_ctx *ctx) __acquires(ctx->completion_lock) { spin_lock(&ctx->completion_lock); } -static inline void io_cq_unlock(struct io_ring_ctx *ctx) - __releases(ctx->completion_lock) -{ - spin_unlock(&ctx->completion_lock); -} - -/* keep it inlined for io_submit_flush_completions() */ static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) - __releases(ctx->completion_lock) -{ - io_commit_cqring(ctx); - __io_cq_unlock(ctx); - io_commit_cqring_flush(ctx); - io_cqring_wake(ctx); -} - -static void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx) - __releases(ctx->completion_lock) { io_commit_cqring(ctx); @@ -641,13 +649,13 @@ static void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx) */ io_commit_cqring_flush(ctx); } else { - __io_cq_unlock(ctx); + spin_unlock(&ctx->completion_lock); io_commit_cqring_flush(ctx); io_cqring_wake(ctx); } } -void io_cq_unlock_post(struct io_ring_ctx *ctx) +static void io_cq_unlock_post(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) { io_commit_cqring(ctx); @@ -662,10 +670,10 @@ static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) struct io_overflow_cqe *ocqe; LIST_HEAD(list); - io_cq_lock(ctx); + spin_lock(&ctx->completion_lock); list_splice_init(&ctx->cq_overflow_list, &list); clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); - io_cq_unlock(ctx); + spin_unlock(&ctx->completion_lock); while (!list_empty(&list)) { ocqe = list_first_entry(&list, struct io_overflow_cqe, list); @@ -722,29 +730,29 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx) } /* can be called by any task */ -static void io_put_task_remote(struct task_struct *task, int nr) +static void io_put_task_remote(struct task_struct *task) { struct io_uring_task *tctx = task->io_uring; - percpu_counter_sub(&tctx->inflight, nr); + percpu_counter_sub(&tctx->inflight, 1); if (unlikely(atomic_read(&tctx->in_cancel))) wake_up(&tctx->wait); - put_task_struct_many(task, nr); + put_task_struct(task); } /* used by a task to put its own references */ -static void io_put_task_local(struct task_struct *task, int nr) +static void io_put_task_local(struct task_struct *task) { - task->io_uring->cached_refs += nr; + task->io_uring->cached_refs++; } /* must to be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task, int nr) +static inline void io_put_task(struct task_struct *task) { if (likely(task == current)) - io_put_task_local(task, nr); + io_put_task_local(task); else - io_put_task_remote(task, nr); + io_put_task_remote(task); } void io_task_refs_refill(struct io_uring_task *tctx) @@ -934,20 +942,19 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags return __io_post_aux_cqe(ctx, user_data, res, cflags, true); } -bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, +bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags, bool allow_overflow) { + struct io_ring_ctx *ctx = req->ctx; + u64 user_data = req->cqe.user_data; struct io_uring_cqe *cqe; - unsigned int length; if (!defer) return __io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow); - length = ARRAY_SIZE(ctx->submit_state.cqes); - lockdep_assert_held(&ctx->uring_lock); - if (ctx->submit_state.cqes_count == length) { + if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->submit_state.cqes)) { __io_cq_lock(ctx); __io_flush_post_cqes(ctx); /* no need to flush - flush is deferred */ @@ -991,14 +998,18 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) } } io_put_kbuf_comp(req); - io_dismantle_req(req); + if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) + io_clean_op(req); + if (!(req->flags & REQ_F_FIXED_FILE)) + io_put_file(req->file); + rsrc_node = req->rsrc_node; /* * Selected buffer deallocation in io_clean_op() assumes that * we don't hold ->completion_lock. Clean them here to avoid * deadlocks. */ - io_put_task_remote(req->task, 1); + io_put_task_remote(req->task); wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; } @@ -1111,36 +1122,13 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) return true; } -static inline void io_dismantle_req(struct io_kiocb *req) -{ - unsigned int flags = req->flags; - - if (unlikely(flags & IO_REQ_CLEAN_FLAGS)) - io_clean_op(req); - if (!(flags & REQ_F_FIXED_FILE)) - io_put_file(req->file); -} - -static __cold void io_free_req_tw(struct io_kiocb *req, struct io_tw_state *ts) -{ - struct io_ring_ctx *ctx = req->ctx; - - if (req->rsrc_node) { - io_tw_lock(ctx, ts); - io_put_rsrc_node(ctx, req->rsrc_node); - } - io_dismantle_req(req); - io_put_task_remote(req->task, 1); - - spin_lock(&ctx->completion_lock); - wq_list_add_head(&req->comp_list, &ctx->locked_free_list); - ctx->locked_free_nr++; - spin_unlock(&ctx->completion_lock); -} - __cold void io_free_req(struct io_kiocb *req) { - req->io_task_work.func = io_free_req_tw; + /* refs were already put, restore them for io_req_task_complete() */ + req->flags &= ~REQ_F_REFCOUNT; + /* we only want to free it, don't post CQEs */ + req->flags |= REQ_F_CQE_SKIP; + req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); } @@ -1205,7 +1193,9 @@ static unsigned int handle_tw_list(struct llist_node *node, ts->locked = mutex_trylock(&(*ctx)->uring_lock); percpu_ref_get(&(*ctx)->refs); } - req->io_task_work.func(req, ts); + INDIRECT_CALL_2(req->io_task_work.func, + io_poll_task_func, io_req_rw_complete, + req, ts); node = next; count++; if (unlikely(need_resched())) { @@ -1303,7 +1293,7 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx) } } -static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) +static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) { struct io_ring_ctx *ctx = req->ctx; unsigned nr_wait, nr_tw, nr_tw_prev; @@ -1354,19 +1344,11 @@ static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); } -void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) +static void io_req_normal_work_add(struct io_kiocb *req) { struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; - if (!(flags & IOU_F_TWQ_FORCE_NORMAL) && - (ctx->flags & IORING_SETUP_DEFER_TASKRUN)) { - rcu_read_lock(); - io_req_local_work_add(req, flags); - rcu_read_unlock(); - return; - } - /* task_work already pending, we're done */ if (!llist_add(&req->io_task_work.node, &tctx->task_list)) return; @@ -1380,6 +1362,17 @@ void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) io_fallback_tw(tctx); } +void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) +{ + if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + rcu_read_lock(); + io_req_local_work_add(req, flags); + rcu_read_unlock(); + } else { + io_req_normal_work_add(req); + } +} + static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) { struct llist_node *node; @@ -1390,7 +1383,7 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) io_task_work.node); node = node->next; - __io_req_task_work_add(req, IOU_F_TWQ_FORCE_NORMAL); + io_req_normal_work_add(req); } } @@ -1405,13 +1398,19 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts) if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); again: - node = io_llist_xchg(&ctx->work_llist, NULL); + /* + * llists are in reverse order, flip it back the right way before + * running the pending items. + */ + node = llist_reverse_order(io_llist_xchg(&ctx->work_llist, NULL)); while (node) { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); prefetch(container_of(next, struct io_kiocb, io_task_work.node)); - req->io_task_work.func(req, ts); + INDIRECT_CALL_2(req->io_task_work.func, + io_poll_task_func, io_req_rw_complete, + req, ts); ret++; node = next; } @@ -1498,9 +1497,6 @@ void io_queue_next(struct io_kiocb *req) void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) { - struct task_struct *task = NULL; - int task_refs = 0; - do { struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); @@ -1530,19 +1526,10 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) io_req_put_rsrc_locked(req, ctx); - if (req->task != task) { - if (task) - io_put_task(task, task_refs); - task = req->task; - task_refs = 0; - } - task_refs++; + io_put_task(req->task); node = req->comp_list.next; io_req_add_to_cache(req, ctx); } while (node); - - if (task) - io_put_task(task, task_refs); } static void __io_submit_flush_completions(struct io_ring_ctx *ctx) @@ -1570,7 +1557,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) } } } - __io_cq_unlock_post_flush(ctx); + __io_cq_unlock_post(ctx); if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { io_free_batch_list(ctx, state->compl_reqs.first); @@ -1578,22 +1565,6 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) } } -/* - * Drop reference to request, return next in chain (if there is one) if this - * was the last reference to this request. - */ -static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) -{ - struct io_kiocb *nxt = NULL; - - if (req_ref_put_and_test(req)) { - if (unlikely(req->flags & IO_REQ_LINK_FLAGS)) - nxt = io_req_find_next(req); - io_free_req(req); - } - return nxt; -} - static unsigned io_cqring_events(struct io_ring_ctx *ctx) { /* See comment at the top of this file */ @@ -1758,54 +1729,14 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) } } -static bool io_bdev_nowait(struct block_device *bdev) -{ - return !bdev || bdev_nowait(bdev); -} - -/* - * If we tracked the file through the SCM inflight mechanism, we could support - * any file. For now, just ensure that anything potentially problematic is done - * inline. - */ -static bool __io_file_supports_nowait(struct file *file, umode_t mode) -{ - if (S_ISBLK(mode)) { - if (IS_ENABLED(CONFIG_BLOCK) && - io_bdev_nowait(I_BDEV(file->f_mapping->host))) - return true; - return false; - } - if (S_ISSOCK(mode)) - return true; - if (S_ISREG(mode)) { - if (IS_ENABLED(CONFIG_BLOCK) && - io_bdev_nowait(file->f_inode->i_sb->s_bdev) && - !io_is_uring_fops(file)) - return true; - return false; - } - - /* any ->read/write should understand O_NONBLOCK */ - if (file->f_flags & O_NONBLOCK) - return true; - return file->f_mode & FMODE_NOWAIT; -} - -/* - * If we tracked the file through the SCM inflight mechanism, we could support - * any file. For now, just ensure that anything potentially problematic is done - * inline. - */ unsigned int io_file_get_flags(struct file *file) { - umode_t mode = file_inode(file)->i_mode; unsigned int res = 0; - if (S_ISREG(mode)) - res |= FFS_ISREG; - if (__io_file_supports_nowait(file, mode)) - res |= FFS_NOWAIT; + if (S_ISREG(file_inode(file)->i_mode)) + res |= REQ_F_ISREG; + if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT)) + res |= REQ_F_SUPPORT_NOWAIT; return res; } @@ -1891,39 +1822,6 @@ queue: spin_unlock(&ctx->completion_lock); } -static void io_clean_op(struct io_kiocb *req) -{ - if (req->flags & REQ_F_BUFFER_SELECTED) { - spin_lock(&req->ctx->completion_lock); - io_put_kbuf_comp(req); - spin_unlock(&req->ctx->completion_lock); - } - - if (req->flags & REQ_F_NEED_CLEANUP) { - const struct io_cold_def *def = &io_cold_defs[req->opcode]; - - if (def->cleanup) - def->cleanup(req); - } - if ((req->flags & REQ_F_POLLED) && req->apoll) { - kfree(req->apoll->double_poll); - kfree(req->apoll); - req->apoll = NULL; - } - if (req->flags & REQ_F_INFLIGHT) { - struct io_uring_task *tctx = req->task->io_uring; - - atomic_dec(&tctx->inflight_tracked); - } - if (req->flags & REQ_F_CREDS) - put_cred(req->creds); - if (req->flags & REQ_F_ASYNC_DATA) { - kfree(req->async_data); - req->async_data = NULL; - } - req->flags &= ~IO_REQ_CLEAN_FLAGS; -} - static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, unsigned int issue_flags) { @@ -1986,9 +1884,14 @@ int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) struct io_wq_work *io_wq_free_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; - req = io_put_req_find_next(req); - return req ? &req->work : NULL; + if (req_ref_put_and_test(req)) { + if (req->flags & IO_REQ_LINK_FLAGS) + nxt = io_req_find_next(req); + io_free_req(req); + } + return nxt ? &nxt->work : NULL; } void io_wq_submit_work(struct io_wq_work *work) @@ -2060,19 +1963,17 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; + struct io_fixed_file *slot; struct file *file = NULL; - unsigned long file_ptr; io_ring_submit_lock(ctx, issue_flags); if (unlikely((unsigned int)fd >= ctx->nr_user_files)) goto out; fd = array_index_nospec(fd, ctx->nr_user_files); - file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; - file = (struct file *) (file_ptr & FFS_MASK); - file_ptr &= ~FFS_MASK; - /* mask in overlapping REQ_F and FFS bits */ - req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); + slot = io_fixed_file_slot(&ctx->file_table, fd); + file = io_slot_file(slot); + req->flags |= io_slot_flags(slot); io_req_set_rsrc_node(req, ctx, 0); out: io_ring_submit_unlock(ctx, issue_flags); @@ -2709,11 +2610,96 @@ static void io_mem_free(void *ptr) free_compound_page(page); } +static void io_pages_free(struct page ***pages, int npages) +{ + struct page **page_array; + int i; + + if (!pages) + return; + page_array = *pages; + for (i = 0; i < npages; i++) + unpin_user_page(page_array[i]); + kvfree(page_array); + *pages = NULL; +} + +static void *__io_uaddr_map(struct page ***pages, unsigned short *npages, + unsigned long uaddr, size_t size) +{ + struct page **page_array; + unsigned int nr_pages; + int ret; + + *npages = 0; + + if (uaddr & (PAGE_SIZE - 1) || !size) + return ERR_PTR(-EINVAL); + + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (nr_pages > USHRT_MAX) + return ERR_PTR(-EINVAL); + page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!page_array) + return ERR_PTR(-ENOMEM); + + ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, + page_array); + if (ret != nr_pages) { +err: + io_pages_free(&page_array, ret > 0 ? ret : 0); + return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT); + } + /* + * Should be a single page. If the ring is small enough that we can + * use a normal page, that is fine. If we need multiple pages, then + * userspace should use a huge page. That's the only way to guarantee + * that we get contigious memory, outside of just being lucky or + * (currently) having low memory fragmentation. + */ + if (page_array[0] != page_array[ret - 1]) + goto err; + *pages = page_array; + *npages = nr_pages; + return page_to_virt(page_array[0]); +} + +static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, + size_t size) +{ + return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr, + size); +} + +static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr, + size_t size) +{ + return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr, + size); +} + +static void io_rings_free(struct io_ring_ctx *ctx) +{ + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { + io_mem_free(ctx->rings); + io_mem_free(ctx->sq_sqes); + ctx->rings = NULL; + ctx->sq_sqes = NULL; + } else { + io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); + io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages); + } +} + static void *io_mem_alloc(size_t size) { gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; + void *ret; - return (void *) __get_free_pages(gfp, get_order(size)); + ret = (void *) __get_free_pages(gfp, get_order(size)); + if (ret) + return ret; + return ERR_PTR(-ENOMEM); } static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, @@ -2869,8 +2855,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) mmdrop(ctx->mm_account); ctx->mm_account = NULL; } - io_mem_free(ctx->rings); - io_mem_free(ctx->sq_sqes); + io_rings_free(ctx); percpu_ref_exit(&ctx->refs); free_uid(ctx->user); @@ -3050,7 +3035,18 @@ static __cold void io_ring_exit_work(struct work_struct *work) /* there is little hope left, don't run it too often */ interval = HZ * 60; } - } while (!wait_for_completion_timeout(&ctx->ref_comp, interval)); + /* + * This is really an uninterruptible wait, as it has to be + * complete. But it's also run from a kworker, which doesn't + * take signals, so it's fine to make it interruptible. This + * avoids scenarios where we knowingly can wait much longer + * on completions, for example if someone does a SIGSTOP on + * a task that needs to finish task_work to make this loop + * complete. That's a synthetic situation that should not + * cause a stuck task backtrace, and hence a potential panic + * on stuck tasks if that is enabled. + */ + } while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval)); init_completion(&exit.completion); init_task_work(&exit.task_work, io_tctx_exit_cb); @@ -3074,7 +3070,12 @@ static __cold void io_ring_exit_work(struct work_struct *work) continue; mutex_unlock(&ctx->uring_lock); - wait_for_completion(&exit.completion); + /* + * See comment above for + * wait_for_completion_interruptible_timeout() on why this + * wait is marked as interruptible. + */ + wait_for_completion_interruptible(&exit.completion); mutex_lock(&ctx->uring_lock); } mutex_unlock(&ctx->uring_lock); @@ -3348,6 +3349,10 @@ static void *io_uring_validate_mmap_request(struct file *file, struct page *page; void *ptr; + /* Don't allow mmap if the ring was setup without it */ + if (ctx->flags & IORING_SETUP_NO_MMAP) + return ERR_PTR(-EINVAL); + switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: @@ -3673,6 +3678,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, { struct io_rings *rings; size_t size, sq_array_offset; + void *ptr; /* make sure these are sane, as we already accounted them */ ctx->sq_entries = p->sq_entries; @@ -3682,9 +3688,13 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, if (size == SIZE_MAX) return -EOVERFLOW; - rings = io_mem_alloc(size); - if (!rings) - return -ENOMEM; + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) + rings = io_mem_alloc(size); + else + rings = io_rings_map(ctx, p->cq_off.user_addr, size); + + if (IS_ERR(rings)) + return PTR_ERR(rings); ctx->rings = rings; ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); @@ -3698,34 +3708,31 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, else size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) { - io_mem_free(ctx->rings); - ctx->rings = NULL; + io_rings_free(ctx); return -EOVERFLOW; } - ctx->sq_sqes = io_mem_alloc(size); - if (!ctx->sq_sqes) { - io_mem_free(ctx->rings); - ctx->rings = NULL; - return -ENOMEM; + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) + ptr = io_mem_alloc(size); + else + ptr = io_sqes_map(ctx, p->sq_off.user_addr, size); + + if (IS_ERR(ptr)) { + io_rings_free(ctx); + return PTR_ERR(ptr); } + ctx->sq_sqes = ptr; return 0; } -static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) +static int io_uring_install_fd(struct file *file) { - int ret, fd; + int fd; fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); if (fd < 0) return fd; - - ret = __io_uring_add_tctx_node(ctx); - if (ret) { - put_unused_fd(fd); - return ret; - } fd_install(fd, file); return fd; } @@ -3765,6 +3772,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, struct io_uring_params __user *params) { struct io_ring_ctx *ctx; + struct io_uring_task *tctx; struct file *file; int ret; @@ -3776,6 +3784,10 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, entries = IORING_MAX_ENTRIES; } + if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY) + && !(p->flags & IORING_SETUP_NO_MMAP)) + return -EINVAL; + /* * Use twice as many entries for the CQ ring. It's possible for the * application to drive a higher depth than the size of the SQ ring, @@ -3887,7 +3899,6 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, if (ret) goto err; - memset(&p->sq_off, 0, sizeof(p->sq_off)); p->sq_off.head = offsetof(struct io_rings, sq.head); p->sq_off.tail = offsetof(struct io_rings, sq.tail); p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); @@ -3895,8 +3906,10 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, p->sq_off.flags = offsetof(struct io_rings, sq_flags); p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; + p->sq_off.resv1 = 0; + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) + p->sq_off.user_addr = 0; - memset(&p->cq_off, 0, sizeof(p->cq_off)); p->cq_off.head = offsetof(struct io_rings, cq.head); p->cq_off.tail = offsetof(struct io_rings, cq.tail); p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); @@ -3904,6 +3917,9 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); p->cq_off.cqes = offsetof(struct io_rings, cqes); p->cq_off.flags = offsetof(struct io_rings, cq_flags); + p->cq_off.resv1 = 0; + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) + p->cq_off.user_addr = 0; p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | @@ -3928,22 +3944,30 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, goto err; } + ret = __io_uring_add_tctx_node(ctx); + if (ret) + goto err_fput; + tctx = current->io_uring; + /* * Install ring fd as the very last thing, so we don't risk someone * having closed it before we finish setup */ - ret = io_uring_install_fd(ctx, file); - if (ret < 0) { - /* fput will clean it up */ - fput(file); - return ret; - } + if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY) + ret = io_ring_add_registered_file(tctx, file, 0, IO_RINGFD_REG_MAX); + else + ret = io_uring_install_fd(file); + if (ret < 0) + goto err_fput; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: io_ring_ctx_wait_and_kill(ctx); return ret; +err_fput: + fput(file); + return ret; } /* @@ -3969,7 +3993,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | - IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN)) + IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | + IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY)) return -EINVAL; return io_uring_create(entries, &p, params); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 259bf798a390..d3606d30cf6f 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -16,9 +16,6 @@ #endif enum { - /* don't use deferred task_work */ - IOU_F_TWQ_FORCE_NORMAL = 1, - /* * A hint to not wake right away but delay until there are enough of * tw's queued to match the number of CQEs the task is waiting for. @@ -26,7 +23,7 @@ enum { * Must not be used wirh requests generating more than one CQE. * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set. */ - IOU_F_TWQ_LAZY_WAKE = 2, + IOU_F_TWQ_LAZY_WAKE = 1, }; enum { @@ -47,7 +44,7 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); -bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, +bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags, bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); @@ -57,11 +54,6 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); -static inline bool io_req_ffs_set(struct io_kiocb *req) -{ - return req->flags & REQ_F_FIXED_FILE; -} - void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); @@ -75,6 +67,9 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); +int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, + int start, int end); + int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); @@ -115,8 +110,6 @@ static inline void io_req_task_work_add(struct io_kiocb *req) #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) -void io_cq_unlock_post(struct io_ring_ctx *ctx); - static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx, bool overflow) { diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 85fd7ce5f05b..cd6dcf634ba3 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -162,14 +162,12 @@ static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_fl struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); struct io_ring_ctx *ctx = req->ctx; struct file *file = NULL; - unsigned long file_ptr; int idx = msg->src_fd; io_ring_submit_lock(ctx, issue_flags); if (likely(idx < ctx->nr_user_files)) { idx = array_index_nospec(idx, ctx->nr_user_files); - file_ptr = io_fixed_file_slot(&ctx->file_table, idx)->file_ptr; - file = (struct file *) (file_ptr & FFS_MASK); + file = io_file_from_index(&ctx->file_table, idx); if (file) get_file(file); } diff --git a/io_uring/net.c b/io_uring/net.c index c8a4b2ac00f7..a8e303796f16 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -624,9 +624,15 @@ static inline void io_recv_prep_retry(struct io_kiocb *req) * again (for multishot). */ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, - unsigned int cflags, bool mshot_finished, + struct msghdr *msg, bool mshot_finished, unsigned issue_flags) { + unsigned int cflags; + + cflags = io_put_kbuf(req, issue_flags); + if (msg->msg_inq && msg->msg_inq != -1U) + cflags |= IORING_CQE_F_SOCK_NONEMPTY; + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { io_req_set_res(req, *ret, cflags); *ret = IOU_OK; @@ -634,10 +640,18 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, } if (!mshot_finished) { - if (io_aux_cqe(req->ctx, issue_flags & IO_URING_F_COMPLETE_DEFER, - req->cqe.user_data, *ret, cflags | IORING_CQE_F_MORE, true)) { + if (io_aux_cqe(req, issue_flags & IO_URING_F_COMPLETE_DEFER, + *ret, cflags | IORING_CQE_F_MORE, true)) { io_recv_prep_retry(req); - return false; + /* Known not-empty or unknown state, retry */ + if (cflags & IORING_CQE_F_SOCK_NONEMPTY || + msg->msg_inq == -1U) + return false; + if (issue_flags & IO_URING_F_MULTISHOT) + *ret = IOU_ISSUE_SKIP_COMPLETE; + else + *ret = -EAGAIN; + return true; } /* Otherwise stop multishot but use the current result. */ } @@ -740,7 +754,6 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr iomsg, *kmsg; struct socket *sock; - unsigned int cflags; unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; @@ -791,6 +804,7 @@ retry_multishot: flags |= MSG_DONTWAIT; kmsg->msg.msg_get_inq = 1; + kmsg->msg.msg_inq = -1U; if (req->flags & REQ_F_APOLL_MULTISHOT) { ret = io_recvmsg_multishot(sock, sr, kmsg, flags, &mshot_finished); @@ -831,11 +845,7 @@ retry_multishot: else io_kbuf_recycle(req, issue_flags); - cflags = io_put_kbuf(req, issue_flags); - if (kmsg->msg.msg_inq) - cflags |= IORING_CQE_F_SOCK_NONEMPTY; - - if (!io_recv_finish(req, &ret, cflags, mshot_finished, issue_flags)) + if (!io_recv_finish(req, &ret, &kmsg->msg, mshot_finished, issue_flags)) goto retry_multishot; if (mshot_finished) { @@ -854,7 +864,6 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct msghdr msg; struct socket *sock; - unsigned int cflags; unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; @@ -871,6 +880,14 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = NULL; + msg.msg_get_inq = 1; + msg.msg_controllen = 0; + msg.msg_iocb = NULL; + msg.msg_ubuf = NULL; + retry_multishot: if (io_do_buffer_select(req)) { void __user *buf; @@ -885,14 +902,8 @@ retry_multishot: if (unlikely(ret)) goto out_free; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - msg.msg_get_inq = 1; + msg.msg_inq = -1U; msg.msg_flags = 0; - msg.msg_controllen = 0; - msg.msg_iocb = NULL; - msg.msg_ubuf = NULL; flags = sr->msg_flags; if (force_nonblock) @@ -932,11 +943,7 @@ out_free: else io_kbuf_recycle(req, issue_flags); - cflags = io_put_kbuf(req, issue_flags); - if (msg.msg_inq) - cflags |= IORING_CQE_F_SOCK_NONEMPTY; - - if (!io_recv_finish(req, &ret, cflags, ret <= 0, issue_flags)) + if (!io_recv_finish(req, &ret, &msg, ret <= 0, issue_flags)) goto retry_multishot; return ret; @@ -1308,7 +1315,6 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int io_accept(struct io_kiocb *req, unsigned int issue_flags) { - struct io_ring_ctx *ctx = req->ctx; struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept); bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; @@ -1358,8 +1364,8 @@ retry: if (ret < 0) return ret; - if (io_aux_cqe(ctx, issue_flags & IO_URING_F_COMPLETE_DEFER, - req->cqe.user_data, ret, IORING_CQE_F_MORE, true)) + if (io_aux_cqe(req, issue_flags & IO_URING_F_COMPLETE_DEFER, ret, + IORING_CQE_F_MORE, true)) goto retry; return -ECANCELED; diff --git a/io_uring/poll.c b/io_uring/poll.c index a78b8af7d9ab..d4597efe14a7 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -300,8 +300,8 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - if (!io_aux_cqe(req->ctx, ts->locked, req->cqe.user_data, - mask, IORING_CQE_F_MORE, false)) { + if (!io_aux_cqe(req, ts->locked, mask, + IORING_CQE_F_MORE, false)) { io_req_set_res(req, mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; } @@ -326,7 +326,7 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) return IOU_POLL_NO_ACTION; } -static void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) +void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) { int ret; diff --git a/io_uring/poll.h b/io_uring/poll.h index b2393b403a2c..ff4d5d753387 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -38,3 +38,5 @@ bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all); void io_apoll_cache_free(struct io_cache_entry *entry); + +void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index d46f72a5ef73..a2dce7ef3a78 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -354,7 +354,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, __s32 __user *fds = u64_to_user_ptr(up->data); struct io_rsrc_data *data = ctx->file_data; struct io_fixed_file *file_slot; - struct file *file; int fd, i, err = 0; unsigned int done; @@ -382,15 +381,16 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, file_slot = io_fixed_file_slot(&ctx->file_table, i); if (file_slot->file_ptr) { - file = (struct file *)(file_slot->file_ptr & FFS_MASK); - err = io_queue_rsrc_removal(data, i, file); + err = io_queue_rsrc_removal(data, i, + io_slot_file(file_slot)); if (err) break; file_slot->file_ptr = 0; io_file_bitmap_clear(&ctx->file_table, i); } if (fd != -1) { - file = fget(fd); + struct file *file = fget(fd); + if (!file) { err = -EBADF; break; diff --git a/io_uring/rw.c b/io_uring/rw.c index 3f118ed46e4f..1bce2208b65c 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -283,7 +283,7 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res) return res; } -static void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) +void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) { io_req_io_end(req); @@ -666,8 +666,8 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) if (unlikely(!file || !(file->f_mode & mode))) return -EBADF; - if (!io_req_ffs_set(req)) - req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; + if (!(req->flags & REQ_F_FIXED_FILE)) + req->flags |= io_file_get_flags(file); kiocb->ki_flags = file->f_iocb_flags; ret = kiocb_set_rw_flags(kiocb, rw->flags); diff --git a/io_uring/rw.h b/io_uring/rw.h index 3b733f4b610a..4b89f9659366 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -22,3 +22,4 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags); int io_writev_prep_async(struct io_kiocb *req); void io_readv_writev_cleanup(struct io_kiocb *req); void io_rw_fail(struct io_kiocb *req); +void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts); diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 3a8d1dd97e1b..c043fe93a3f2 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -208,31 +208,40 @@ void io_uring_unreg_ringfd(void) } } -static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, +int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int start, int end) { - struct file *file; int offset; - for (offset = start; offset < end; offset++) { offset = array_index_nospec(offset, IO_RINGFD_REG_MAX); if (tctx->registered_rings[offset]) continue; - file = fget(fd); - if (!file) { - return -EBADF; - } else if (!io_is_uring_fops(file)) { - fput(file); - return -EOPNOTSUPP; - } tctx->registered_rings[offset] = file; return offset; } - return -EBUSY; } +static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, + int start, int end) +{ + struct file *file; + int offset; + + file = fget(fd); + if (!file) { + return -EBADF; + } else if (!io_is_uring_fops(file)) { + fput(file); + return -EOPNOTSUPP; + } + offset = io_ring_add_registered_file(tctx, file, start, end); + if (offset < 0) + fput(file); + return offset; +} + /* * Register a ring fd to avoid fdget/fdput for each io_uring_enter() * invocation. User passes in an array of struct io_uring_rsrc_update diff --git a/io_uring/timeout.c b/io_uring/timeout.c index fc950177e2e1..fb0547b35dcd 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -73,8 +73,8 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) if (!io_timeout_finish(timeout, data)) { bool filled; - filled = io_aux_cqe(ctx, ts->locked, req->cqe.user_data, -ETIME, - IORING_CQE_F_MORE, false); + filled = io_aux_cqe(req, ts->locked, -ETIME, IORING_CQE_F_MORE, + false); if (filled) { /* re-arm timer */ spin_lock_irq(&ctx->timeout_lock); @@ -594,7 +594,7 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags) goto add; } - tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); + tail = data_race(ctx->cached_cq_tail) - atomic_read(&ctx->cq_timeouts); timeout->target_seq = tail + off; /* Update the last seq here in case io_flush_timeouts() hasn't. diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 5e32db48696d..476c7877ce58 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -20,16 +20,24 @@ static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) ioucmd->task_work_cb(ioucmd, issue_flags); } -void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned), + unsigned flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); ioucmd->task_work_cb = task_work_cb; req->io_task_work.func = io_uring_cmd_work; - io_req_task_work_add(req); + __io_req_task_work_add(req, flags); +} +EXPORT_SYMBOL_GPL(__io_uring_cmd_do_in_task); + +void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ + __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE); } -EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task); +EXPORT_SYMBOL_GPL(io_uring_cmd_do_in_task_lazy); static inline void io_req_set_cqe32_extra(struct io_kiocb *req, u64 extra1, u64 extra2) diff --git a/net/socket.c b/net/socket.c index 401778380195..e46b162f1182 100644 --- a/net/socket.c +++ b/net/socket.c @@ -471,6 +471,7 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) return file; } + file->f_mode |= FMODE_NOWAIT; sock->file = file; file->private_data = sock; stream_open(SOCK_INODE(sock), file); |