diff options
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r-- | fs/io_uring.c | 1234 |
1 files changed, 763 insertions, 471 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c index 9b1833fedc5c..187dd94fd6b1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -289,7 +289,10 @@ struct io_ring_ctx { */ struct io_poll_iocb { struct file *file; - struct wait_queue_head *head; + union { + struct wait_queue_head *head; + u64 addr; + }; __poll_t events; bool done; bool canceled; @@ -304,6 +307,51 @@ struct io_timeout_data { u32 seq_offset; }; +struct io_accept { + struct file *file; + struct sockaddr __user *addr; + int __user *addr_len; + int flags; +}; + +struct io_sync { + struct file *file; + loff_t len; + loff_t off; + int flags; +}; + +struct io_cancel { + struct file *file; + u64 addr; +}; + +struct io_timeout { + struct file *file; + u64 addr; + int flags; + unsigned count; +}; + +struct io_rw { + /* NOTE: kiocb has the file as the first member, so don't do it here */ + struct kiocb kiocb; + u64 addr; + u64 len; +}; + +struct io_connect { + struct file *file; + struct sockaddr __user *addr; + int addr_len; +}; + +struct io_sr_msg { + struct file *file; + struct user_msghdr __user *msg; + int msg_flags; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -323,7 +371,6 @@ struct io_async_rw { }; struct io_async_ctx { - struct io_uring_sqe sqe; union { struct io_async_rw rw; struct io_async_msghdr msg; @@ -341,17 +388,23 @@ struct io_async_ctx { struct io_kiocb { union { struct file *file; - struct kiocb rw; + struct io_rw rw; struct io_poll_iocb poll; + struct io_accept accept; + struct io_sync sync; + struct io_cancel cancel; + struct io_timeout timeout; + struct io_connect connect; + struct io_sr_msg sr_msg; }; - const struct io_uring_sqe *sqe; struct io_async_ctx *io; struct file *ring_file; int ring_fd; bool has_user; bool in_async; bool needs_fixed_file; + u8 opcode; struct io_ring_ctx *ctx; union { @@ -564,12 +617,10 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe) +static inline bool io_req_needs_user(struct io_kiocb *req) { - u8 opcode = READ_ONCE(sqe->opcode); - - return !(opcode == IORING_OP_READ_FIXED || - opcode == IORING_OP_WRITE_FIXED); + return !(req->opcode == IORING_OP_READ_FIXED || + req->opcode == IORING_OP_WRITE_FIXED); } static inline bool io_prep_async_work(struct io_kiocb *req, @@ -577,33 +628,31 @@ static inline bool io_prep_async_work(struct io_kiocb *req, { bool do_hashed = false; - if (req->sqe) { - switch (req->sqe->opcode) { - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - /* only regular files should be hashed for writes */ - if (req->flags & REQ_F_ISREG) - do_hashed = true; - /* fall-through */ - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_SENDMSG: - case IORING_OP_RECVMSG: - case IORING_OP_ACCEPT: - case IORING_OP_POLL_ADD: - case IORING_OP_CONNECT: - /* - * We know REQ_F_ISREG is not set on some of these - * opcodes, but this enables us to keep the check in - * just one place. - */ - if (!(req->flags & REQ_F_ISREG)) - req->work.flags |= IO_WQ_WORK_UNBOUND; - break; - } - if (io_sqe_needs_user(req->sqe)) - req->work.flags |= IO_WQ_WORK_NEEDS_USER; + switch (req->opcode) { + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + /* only regular files should be hashed for writes */ + if (req->flags & REQ_F_ISREG) + do_hashed = true; + /* fall-through */ + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_SENDMSG: + case IORING_OP_RECVMSG: + case IORING_OP_ACCEPT: + case IORING_OP_POLL_ADD: + case IORING_OP_CONNECT: + /* + * We know REQ_F_ISREG is not set on some of these + * opcodes, but this enables us to keep the check in + * just one place. + */ + if (!(req->flags & REQ_F_ISREG)) + req->work.flags |= IO_WQ_WORK_UNBOUND; + break; } + if (io_req_needs_user(req)) + req->work.flags |= IO_WQ_WORK_NEEDS_USER; *link = io_prep_linked_timeout(req); return do_hashed; @@ -972,7 +1021,7 @@ static void io_fail_links(struct io_kiocb *req) trace_io_uring_fail_link(req, link); if ((req->flags & REQ_F_LINK_TIMEOUT) && - link->sqe->opcode == IORING_OP_LINK_TIMEOUT) { + link->opcode == IORING_OP_LINK_TIMEOUT) { io_link_cancel_timeout(link); } else { io_cqring_fill_event(link, -ECANCELED); @@ -1148,7 +1197,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, ret = 0; list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) { - struct kiocb *kiocb = &req->rw; + struct kiocb *kiocb = &req->rw.kiocb; /* * Move completed entries to our local list. If we find a @@ -1178,7 +1227,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, } /* - * Poll for a mininum of 'min' events. Note that if min == 0 we consider that a + * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a * non-spinning poll check - we'll still enter the driver poll loop, but only * as a non-spinning completion check. */ @@ -1303,7 +1352,7 @@ static inline void req_set_fail_links(struct io_kiocb *req) static void io_complete_rw_common(struct kiocb *kiocb, long res) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); @@ -1315,7 +1364,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res) static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); io_complete_rw_common(kiocb, res); io_put_req(req); @@ -1323,7 +1372,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2) static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); struct io_kiocb *nxt = NULL; io_complete_rw_common(kiocb, res); @@ -1334,7 +1383,7 @@ static struct io_kiocb *__io_complete_rw(struct kiocb *kiocb, long res) static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); @@ -1368,7 +1417,7 @@ static void io_iopoll_req_issued(struct io_kiocb *req) list_req = list_first_entry(&ctx->poll_list, struct io_kiocb, list); - if (list_req->rw.ki_filp != req->rw.ki_filp) + if (list_req->file != req->file) ctx->poll_multi_file = true; } @@ -1439,11 +1488,11 @@ static bool io_file_supports_async(struct file *file) return false; } -static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) +static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) { - const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; - struct kiocb *kiocb = &req->rw; + struct kiocb *kiocb = &req->rw.kiocb; unsigned ioprio; int ret; @@ -1492,6 +1541,12 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) return -EINVAL; kiocb->ki_complete = io_complete_rw; } + + req->rw.addr = READ_ONCE(sqe->addr); + req->rw.len = READ_ONCE(sqe->len); + /* we own ->private, reuse it for the buffer index */ + req->rw.kiocb.private = (void *) (unsigned long) + READ_ONCE(sqe->buf_index); return 0; } @@ -1525,11 +1580,11 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt, io_rw_done(kiocb, ret); } -static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw, - const struct io_uring_sqe *sqe, +static ssize_t io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) { - size_t len = READ_ONCE(sqe->len); + struct io_ring_ctx *ctx = req->ctx; + size_t len = req->rw.len; struct io_mapped_ubuf *imu; unsigned index, buf_index; size_t offset; @@ -1539,13 +1594,13 @@ static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw, if (unlikely(!ctx->user_bufs)) return -EFAULT; - buf_index = READ_ONCE(sqe->buf_index); + buf_index = (unsigned long) req->rw.kiocb.private; if (unlikely(buf_index >= ctx->nr_user_bufs)) return -EFAULT; index = array_index_nospec(buf_index, ctx->nr_user_bufs); imu = &ctx->user_bufs[index]; - buf_addr = READ_ONCE(sqe->addr); + buf_addr = req->rw.addr; /* overflow */ if (buf_addr + len < buf_addr) @@ -1602,25 +1657,20 @@ static ssize_t io_import_fixed(struct io_ring_ctx *ctx, int rw, static ssize_t io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, struct iov_iter *iter) { - const struct io_uring_sqe *sqe = req->sqe; - void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); - size_t sqe_len = READ_ONCE(sqe->len); + void __user *buf = u64_to_user_ptr(req->rw.addr); + size_t sqe_len = req->rw.len; u8 opcode; - /* - * We're reading ->opcode for the second time, but the first read - * doesn't care whether it's _FIXED or not, so it doesn't matter - * whether ->opcode changes concurrently. The first read does care - * about whether it is a READ or a WRITE, so we don't trust this read - * for that purpose and instead let the caller pass in the read/write - * flag. - */ - opcode = READ_ONCE(sqe->opcode); + opcode = req->opcode; if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { *iovec = NULL; - return io_import_fixed(req->ctx, rw, sqe, iter); + return io_import_fixed(req, rw, iter); } + /* buffer index only valid with fixed read/write */ + if (req->rw.kiocb.private) + return -EINVAL; + if (req->io) { struct io_async_rw *iorw = &req->io->rw; @@ -1701,7 +1751,7 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, return ret; } -static void io_req_map_io(struct io_kiocb *req, ssize_t io_size, +static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, struct iovec *iovec, struct iovec *fast_iov, struct iov_iter *iter) { @@ -1715,57 +1765,85 @@ static void io_req_map_io(struct io_kiocb *req, ssize_t io_size, } } -static int io_setup_async_io(struct io_kiocb *req, ssize_t io_size, +static int io_alloc_async_ctx(struct io_kiocb *req) +{ + req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); + return req->io == NULL; +} + +static void io_rw_async(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct iovec *iov = NULL; + + if (req->io->rw.iov != req->io->rw.fast_iov) + iov = req->io->rw.iov; + io_wq_submit_work(workptr); + kfree(iov); +} + +static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, struct iovec *iovec, struct iovec *fast_iov, struct iov_iter *iter) { - req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); - if (req->io) { - io_req_map_io(req, io_size, iovec, fast_iov, iter); - memcpy(&req->io->sqe, req->sqe, sizeof(req->io->sqe)); - req->sqe = &req->io->sqe; + if (req->opcode == IORING_OP_READ_FIXED || + req->opcode == IORING_OP_WRITE_FIXED) return 0; - } + if (!req->io && io_alloc_async_ctx(req)) + return -ENOMEM; - return -ENOMEM; + io_req_map_rw(req, io_size, iovec, fast_iov, iter); + req->work.func = io_rw_async; + return 0; } -static int io_read_prep(struct io_kiocb *req, struct iovec **iovec, - struct iov_iter *iter, bool force_nonblock) +static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) { + struct io_async_ctx *io; + struct iov_iter iter; ssize_t ret; - ret = io_prep_rw(req, force_nonblock); + ret = io_prep_rw(req, sqe, force_nonblock); if (ret) return ret; if (unlikely(!(req->file->f_mode & FMODE_READ))) return -EBADF; - return io_import_iovec(READ, req, iovec, iter); + if (!req->io) + return 0; + + io = req->io; + io->rw.iov = io->rw.fast_iov; + req->io = NULL; + ret = io_import_iovec(READ, req, &io->rw.iov, &iter); + req->io = io; + if (ret < 0) + return ret; + + io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); + return 0; } static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; - struct kiocb *kiocb = &req->rw; + struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter iter; - struct file *file; size_t iov_count; ssize_t io_size, ret; - if (!req->io) { - ret = io_read_prep(req, &iovec, &iter, force_nonblock); - if (ret < 0) - return ret; - } else { - ret = io_import_iovec(READ, req, &iovec, &iter); - if (ret < 0) - return ret; - } + ret = io_import_iovec(READ, req, &iovec, &iter); + if (ret < 0) + return ret; - file = req->file; + /* Ensure we clear previously set non-block flag */ + if (!force_nonblock) + req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; + + req->result = 0; io_size = ret; if (req->flags & REQ_F_LINK) req->result = io_size; @@ -1774,39 +1852,27 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so * we know to async punt it even if it was opened O_NONBLOCK */ - if (force_nonblock && !io_file_supports_async(file)) { + if (force_nonblock && !io_file_supports_async(req->file)) { req->flags |= REQ_F_MUST_PUNT; goto copy_iov; } iov_count = iov_iter_count(&iter); - ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count); + ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count); if (!ret) { ssize_t ret2; - if (file->f_op->read_iter) - ret2 = call_read_iter(file, kiocb, &iter); + if (req->file->f_op->read_iter) + ret2 = call_read_iter(req->file, kiocb, &iter); else - ret2 = loop_rw_iter(READ, file, kiocb, &iter); + ret2 = loop_rw_iter(READ, req->file, kiocb, &iter); - /* - * In case of a short read, punt to async. This can happen - * if we have data partially cached. Alternatively we can - * return the short read, in which case the application will - * need to issue another SQE and wait for it. That SQE will - * need async punt anyway, so it's more efficient to do it - * here. - */ - if (force_nonblock && !(req->flags & REQ_F_NOWAIT) && - (req->flags & REQ_F_ISREG) && - ret2 > 0 && ret2 < io_size) - ret2 = -EAGAIN; /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2, nxt, req->in_async); } else { copy_iov: - ret = io_setup_async_io(req, io_size, iovec, + ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter); if (ret) goto out_free; @@ -1814,46 +1880,58 @@ copy_iov: } } out_free: - kfree(iovec); + if (!io_wq_current_is_worker()) + kfree(iovec); return ret; } -static int io_write_prep(struct io_kiocb *req, struct iovec **iovec, - struct iov_iter *iter, bool force_nonblock) +static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) { + struct io_async_ctx *io; + struct iov_iter iter; ssize_t ret; - ret = io_prep_rw(req, force_nonblock); + ret = io_prep_rw(req, sqe, force_nonblock); if (ret) return ret; if (unlikely(!(req->file->f_mode & FMODE_WRITE))) return -EBADF; - return io_import_iovec(WRITE, req, iovec, iter); + if (!req->io) + return 0; + + io = req->io; + io->rw.iov = io->rw.fast_iov; + req->io = NULL; + ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter); + req->io = io; + if (ret < 0) + return ret; + + io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter); + return 0; } static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; - struct kiocb *kiocb = &req->rw; + struct kiocb *kiocb = &req->rw.kiocb; struct iov_iter iter; - struct file *file; size_t iov_count; ssize_t ret, io_size; - if (!req->io) { - ret = io_write_prep(req, &iovec, &iter, force_nonblock); - if (ret < 0) - return ret; - } else { - ret = io_import_iovec(WRITE, req, &iovec, &iter); - if (ret < 0) - return ret; - } + ret = io_import_iovec(WRITE, req, &iovec, &iter); + if (ret < 0) + return ret; - file = kiocb->ki_filp; + /* Ensure we clear previously set non-block flag */ + if (!force_nonblock) + req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT; + + req->result = 0; io_size = ret; if (req->flags & REQ_F_LINK) req->result = io_size; @@ -1873,7 +1951,7 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, goto copy_iov; iov_count = iov_iter_count(&iter); - ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count); + ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count); if (!ret) { ssize_t ret2; @@ -1885,22 +1963,22 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, * we return to userspace. */ if (req->flags & REQ_F_ISREG) { - __sb_start_write(file_inode(file)->i_sb, + __sb_start_write(file_inode(req->file)->i_sb, SB_FREEZE_WRITE, true); - __sb_writers_release(file_inode(file)->i_sb, + __sb_writers_release(file_inode(req->file)->i_sb, SB_FREEZE_WRITE); } kiocb->ki_flags |= IOCB_WRITE; - if (file->f_op->write_iter) - ret2 = call_write_iter(file, kiocb, &iter); + if (req->file->f_op->write_iter) + ret2 = call_write_iter(req->file, kiocb, &iter); else - ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); + ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2, nxt, req->in_async); } else { copy_iov: - ret = io_setup_async_io(req, io_size, iovec, + ret = io_setup_async_rw(req, io_size, iovec, inline_vecs, &iter); if (ret) goto out_free; @@ -1908,7 +1986,8 @@ copy_iov: } } out_free: - kfree(iovec); + if (!io_wq_current_is_worker()) + kfree(iovec); return ret; } @@ -1939,45 +2018,92 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) return -EINVAL; + req->sync.flags = READ_ONCE(sqe->fsync_flags); + if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC)) + return -EINVAL; + + req->sync.off = READ_ONCE(sqe->off); + req->sync.len = READ_ONCE(sqe->len); return 0; } -static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static bool io_req_cancelled(struct io_kiocb *req) { - loff_t sqe_off = READ_ONCE(sqe->off); - loff_t sqe_len = READ_ONCE(sqe->len); - loff_t end = sqe_off + sqe_len; - unsigned fsync_flags; - int ret; + if (req->work.flags & IO_WQ_WORK_CANCEL) { + req_set_fail_links(req); + io_cqring_add_event(req, -ECANCELED); + io_put_req(req); + return true; + } - fsync_flags = READ_ONCE(sqe->fsync_flags); - if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC)) - return -EINVAL; + return false; +} - ret = io_prep_fsync(req, sqe); - if (ret) - return ret; +static void io_link_work_cb(struct io_wq_work **workptr) +{ + struct io_wq_work *work = *workptr; + struct io_kiocb *link = work->data; - /* fsync always requires a blocking context */ - if (force_nonblock) - return -EAGAIN; + io_queue_linked_timeout(link); + work->func = io_wq_submit_work; +} - ret = vfs_fsync_range(req->rw.ki_filp, sqe_off, - end > 0 ? end : LLONG_MAX, - fsync_flags & IORING_FSYNC_DATASYNC); +static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) +{ + struct io_kiocb *link; + + io_prep_async_work(nxt, &link); + *workptr = &nxt->work; + if (link) { + nxt->work.flags |= IO_WQ_WORK_CB; + nxt->work.func = io_link_work_cb; + nxt->work.data = link; + } +} + +static void io_fsync_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + loff_t end = req->sync.off + req->sync.len; + struct io_kiocb *nxt = NULL; + int ret; + + if (io_req_cancelled(req)) + return; + ret = vfs_fsync_range(req->file, req->sync.off, + end > 0 ? end : LLONG_MAX, + req->sync.flags & IORING_FSYNC_DATASYNC); if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + io_put_req_find_next(req, &nxt); + if (nxt) + io_wq_assign_next(workptr, nxt); +} + +static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_wq_work *work, *old_work; + + /* fsync always requires a blocking context */ + if (force_nonblock) { + io_put_req(req); + req->work.func = io_fsync_finish; + return -EAGAIN; + } + + work = old_work = &req->work; + io_fsync_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); return 0; } static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; - int ret = 0; if (!req->file) return -EBADF; @@ -1987,60 +2113,88 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) return -EINVAL; - return ret; + req->sync.off = READ_ONCE(sqe->off); + req->sync.len = READ_ONCE(sqe->len); + req->sync.flags = READ_ONCE(sqe->sync_range_flags); + return 0; } -static int io_sync_file_range(struct io_kiocb *req, - const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, - bool force_nonblock) +static void io_sync_file_range_finish(struct io_wq_work **workptr) { - loff_t sqe_off; - loff_t sqe_len; - unsigned flags; + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; int ret; - ret = io_prep_sfr(req, sqe); - if (ret) - return ret; + if (io_req_cancelled(req)) + return; + + ret = sync_file_range(req->file, req->sync.off, req->sync.len, + req->sync.flags); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, &nxt); + if (nxt) + io_wq_assign_next(workptr, nxt); +} + +static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_wq_work *work, *old_work; /* sync_file_range always requires a blocking context */ - if (force_nonblock) + if (force_nonblock) { + io_put_req(req); + req->work.func = io_sync_file_range_finish; return -EAGAIN; + } - sqe_off = READ_ONCE(sqe->off); - sqe_len = READ_ONCE(sqe->len); - flags = READ_ONCE(sqe->sync_range_flags); + work = old_work = &req->work; + io_sync_file_range_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); + return 0; +} - ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags); +#if defined(CONFIG_NET) +static void io_sendrecv_async(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct iovec *iov = NULL; - if (ret < 0) - req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); - return 0; + if (req->io->rw.iov != req->io->rw.fast_iov) + iov = req->io->msg.iov; + io_wq_submit_work(workptr); + kfree(iov); } +#endif -static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) +static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) - const struct io_uring_sqe *sqe = req->sqe; - struct user_msghdr __user *msg; - unsigned flags; + struct io_sr_msg *sr = &req->sr_msg; + struct io_async_ctx *io = req->io; + + sr->msg_flags = READ_ONCE(sqe->msg_flags); + sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + + if (!io) + return 0; - flags = READ_ONCE(sqe->msg_flags); - msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); io->msg.iov = io->msg.fast_iov; - return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov); + return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + &io->msg.iov); #else - return 0; + return -EOPNOTSUPP; #endif } -static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { #if defined(CONFIG_NET) + struct io_async_msghdr *kmsg = NULL; struct socket *sock; int ret; @@ -2049,46 +2203,52 @@ static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, sock = sock_from_file(req->file, &ret); if (sock) { - struct io_async_ctx io, *copy; + struct io_async_ctx io; struct sockaddr_storage addr; - struct msghdr *kmsg; unsigned flags; - flags = READ_ONCE(sqe->msg_flags); - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; - if (req->io) { - kmsg = &req->io->msg.msg; - kmsg->msg_name = &addr; + kmsg = &req->io->msg; + kmsg->msg.msg_name = &addr; + /* if iov is set, it's allocated already */ + if (!kmsg->iov) + kmsg->iov = kmsg->fast_iov; + kmsg->msg.msg_iter.iov = kmsg->iov; } else { - kmsg = &io.msg.msg; - kmsg->msg_name = &addr; - ret = io_sendmsg_prep(req, &io); + struct io_sr_msg *sr = &req->sr_msg; + + kmsg = &io.msg; + kmsg->msg.msg_name = &addr; + + io.msg.iov = io.msg.fast_iov; + ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg, + sr->msg_flags, &io.msg.iov); if (ret) - goto out; + return ret; } - ret = __sys_sendmsg_sock(sock, kmsg, flags); + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); if (force_nonblock && ret == -EAGAIN) { - copy = kmalloc(sizeof(*copy), GFP_KERNEL); - if (!copy) { - ret = -ENOMEM; - goto out; - } - memcpy(©->msg, &io.msg, sizeof(copy->msg)); - req->io = copy; - memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe)); - req->sqe = &req->io->sqe; - return ret; + if (req->io) + return -EAGAIN; + if (io_alloc_async_ctx(req)) + return -ENOMEM; + memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); + req->work.func = io_sendrecv_async; + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; } -out: + if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov) + kfree(kmsg->iov); io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); @@ -2099,27 +2259,32 @@ out: #endif } -static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) +static int io_recvmsg_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) - const struct io_uring_sqe *sqe = req->sqe; - struct user_msghdr __user *msg; - unsigned flags; + struct io_sr_msg *sr = &req->sr_msg; + struct io_async_ctx *io = req->io; + + sr->msg_flags = READ_ONCE(sqe->msg_flags); + sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + + if (!io) + return 0; - flags = READ_ONCE(sqe->msg_flags); - msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); io->msg.iov = io->msg.fast_iov; - return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr, - &io->msg.iov); + return recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + &io->msg.uaddr, &io->msg.iov); #else - return 0; + return -EOPNOTSUPP; #endif } -static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { #if defined(CONFIG_NET) + struct io_async_msghdr *kmsg = NULL; struct socket *sock; int ret; @@ -2128,49 +2293,54 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, sock = sock_from_file(req->file, &ret); if (sock) { - struct user_msghdr __user *msg; - struct io_async_ctx io, *copy; + struct io_async_ctx io; struct sockaddr_storage addr; - struct msghdr *kmsg; unsigned flags; - flags = READ_ONCE(sqe->msg_flags); - if (flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - else if (force_nonblock) - flags |= MSG_DONTWAIT; - - msg = (struct user_msghdr __user *) (unsigned long) - READ_ONCE(sqe->addr); if (req->io) { - kmsg = &req->io->msg.msg; - kmsg->msg_name = &addr; + kmsg = &req->io->msg; + kmsg->msg.msg_name = &addr; + /* if iov is set, it's allocated already */ + if (!kmsg->iov) + kmsg->iov = kmsg->fast_iov; + kmsg->msg.msg_iter.iov = kmsg->iov; } else { - kmsg = &io.msg.msg; - kmsg->msg_name = &addr; - ret = io_recvmsg_prep(req, &io); + struct io_sr_msg *sr = &req->sr_msg; + + kmsg = &io.msg; + kmsg->msg.msg_name = &addr; + + io.msg.iov = io.msg.fast_iov; + ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg, + sr->msg_flags, &io.msg.uaddr, + &io.msg.iov); if (ret) - goto out; + return ret; } - ret = __sys_recvmsg_sock(sock, kmsg, msg, io.msg.uaddr, flags); + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg, + kmsg->uaddr, flags); if (force_nonblock && ret == -EAGAIN) { - copy = kmalloc(sizeof(*copy), GFP_KERNEL); - if (!copy) { - ret = -ENOMEM; - goto out; - } - memcpy(copy, &io, sizeof(*copy)); - req->io = copy; - memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe)); - req->sqe = &req->io->sqe; - return ret; + if (req->io) + return -EAGAIN; + if (io_alloc_async_ctx(req)) + return -ENOMEM; + memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); + req->work.func = io_sendrecv_async; + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; } -out: + if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov) + kfree(kmsg->iov); io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); @@ -2181,30 +2351,38 @@ out: #endif } -static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) - struct sockaddr __user *addr; - int __user *addr_len; - unsigned file_flags; - int flags, ret; + struct io_accept *accept = &req->accept; if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) return -EINVAL; if (sqe->ioprio || sqe->len || sqe->buf_index) return -EINVAL; - addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); - addr_len = (int __user *) (unsigned long) READ_ONCE(sqe->addr2); - flags = READ_ONCE(sqe->accept_flags); - file_flags = force_nonblock ? O_NONBLOCK : 0; + accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + accept->flags = READ_ONCE(sqe->accept_flags); + return 0; +#else + return -EOPNOTSUPP; +#endif +} - ret = __sys_accept4_file(req->file, file_flags, addr, addr_len, flags); - if (ret == -EAGAIN && force_nonblock) { - req->work.flags |= IO_WQ_WORK_NEEDS_FILES; +#if defined(CONFIG_NET) +static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_accept *accept = &req->accept; + unsigned file_flags; + int ret; + + file_flags = force_nonblock ? O_NONBLOCK : 0; + ret = __sys_accept4_file(req->file, file_flags, accept->addr, + accept->addr_len, accept->flags); + if (ret == -EAGAIN && force_nonblock) return -EAGAIN; - } if (ret == -ERESTARTSYS) ret = -EINTR; if (ret < 0) @@ -2212,63 +2390,95 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, io_cqring_add_event(req, ret); io_put_req_find_next(req, nxt); return 0; -#else - return -EOPNOTSUPP; -#endif } -static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io) +static void io_accept_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + + if (io_req_cancelled(req)) + return; + __io_accept(req, &nxt, false); + if (nxt) + io_wq_assign_next(workptr, nxt); +} +#endif + +static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { #if defined(CONFIG_NET) - const struct io_uring_sqe *sqe = req->sqe; - struct sockaddr __user *addr; - int addr_len; + int ret; - addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); - addr_len = READ_ONCE(sqe->addr2); - return move_addr_to_kernel(addr, addr_len, &io->connect.address); -#else + ret = __io_accept(req, nxt, force_nonblock); + if (ret == -EAGAIN && force_nonblock) { + req->work.func = io_accept_finish; + req->work.flags |= IO_WQ_WORK_NEEDS_FILES; + io_put_req(req); + return -EAGAIN; + } return 0; +#else + return -EOPNOTSUPP; #endif } -static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) - struct io_async_ctx __io, *io; - unsigned file_flags; - int addr_len, ret; + struct io_connect *conn = &req->connect; + struct io_async_ctx *io = req->io; if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) return -EINVAL; if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) return -EINVAL; - addr_len = READ_ONCE(sqe->addr2); - file_flags = force_nonblock ? O_NONBLOCK : 0; + conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + conn->addr_len = READ_ONCE(sqe->addr2); + + if (!io) + return 0; + + return move_addr_to_kernel(conn->addr, conn->addr_len, + &io->connect.address); +#else + return -EOPNOTSUPP; +#endif +} + +static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + struct io_async_ctx __io, *io; + unsigned file_flags; + int ret; if (req->io) { io = req->io; } else { - ret = io_connect_prep(req, &__io); + ret = move_addr_to_kernel(req->connect.addr, + req->connect.addr_len, + &__io.connect.address); if (ret) goto out; io = &__io; } - ret = __sys_connect_file(req->file, &io->connect.address, addr_len, - file_flags); + file_flags = force_nonblock ? O_NONBLOCK : 0; + + ret = __sys_connect_file(req->file, &io->connect.address, + req->connect.addr_len, file_flags); if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { - io = kmalloc(sizeof(*io), GFP_KERNEL); - if (!io) { + if (req->io) + return -EAGAIN; + if (io_alloc_async_ctx(req)) { ret = -ENOMEM; goto out; } - memcpy(&io->connect, &__io.connect, sizeof(io->connect)); - req->io = io; - memcpy(&io->sqe, req->sqe, sizeof(*req->sqe)); - req->sqe = &io->sqe; + memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect)); return -EAGAIN; } if (ret == -ERESTARTSYS) @@ -2331,23 +2541,32 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) return -ENOENT; } +static int io_poll_remove_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || + sqe->poll_events) + return -EINVAL; + + req->poll.addr = READ_ONCE(sqe->addr); + return 0; +} + /* * Find a running poll command that matches one specified in sqe->addr, * and remove it if found. */ -static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_poll_remove(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + u64 addr; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || - sqe->poll_events) - return -EINVAL; - + addr = req->poll.addr; spin_lock_irq(&ctx->completion_lock); - ret = io_poll_cancel(ctx, READ_ONCE(sqe->addr)); + ret = io_poll_cancel(ctx, addr); spin_unlock_irq(&ctx->completion_lock); io_cqring_add_event(req, ret); @@ -2413,7 +2632,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr) req_set_fail_links(req); io_put_req_find_next(req, &nxt); if (nxt) - *workptr = &nxt->work; + io_wq_assign_next(workptr, nxt); } static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, @@ -2482,14 +2701,9 @@ static void io_poll_req_insert(struct io_kiocb *req) hlist_add_head(&req->hash_node, list); } -static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt) +static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_poll_iocb *poll = &req->poll; - struct io_ring_ctx *ctx = req->ctx; - struct io_poll_table ipt; - bool cancel = false; - __poll_t mask; u16 events; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) @@ -2499,10 +2713,20 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (!poll->file) return -EBADF; - req->io = NULL; - INIT_IO_WORK(&req->work, io_poll_complete_work); events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; + return 0; +} + +static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt) +{ + struct io_poll_iocb *poll = &req->poll; + struct io_ring_ctx *ctx = req->ctx; + struct io_poll_table ipt; + bool cancel = false; + __poll_t mask; + + INIT_IO_WORK(&req->work, io_poll_complete_work); INIT_HLIST_NODE(&req->hash_node); poll->head = NULL; @@ -2573,7 +2797,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) /* * Adjust the reqs sequence before the current one because it - * will consume a slot in the cq_ring and the the cq_tail + * will consume a slot in the cq_ring and the cq_tail * pointer will be increased, otherwise other timeout reqs may * return in advance without waiting for enough wait_nr. */ @@ -2619,26 +2843,32 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) return 0; } +static int io_timeout_remove_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len) + return -EINVAL; + + req->timeout.addr = READ_ONCE(sqe->addr); + req->timeout.flags = READ_ONCE(sqe->timeout_flags); + if (req->timeout.flags) + return -EINVAL; + + return 0; +} + /* * Remove or update an existing timeout command */ -static int io_timeout_remove(struct io_kiocb *req, - const struct io_uring_sqe *sqe) +static int io_timeout_remove(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - unsigned flags; int ret; - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len) - return -EINVAL; - flags = READ_ONCE(sqe->timeout_flags); - if (flags) - return -EINVAL; - spin_lock_irq(&ctx->completion_lock); - ret = io_timeout_cancel(ctx, READ_ONCE(sqe->addr)); + ret = io_timeout_cancel(ctx, req->timeout.addr); io_cqring_fill_event(req, ret); io_commit_cqring(ctx); @@ -2650,10 +2880,9 @@ static int io_timeout_remove(struct io_kiocb *req, return 0; } -static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, +static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, bool is_timeout_link) { - const struct io_uring_sqe *sqe = req->sqe; struct io_timeout_data *data; unsigned flags; @@ -2667,7 +2896,12 @@ static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, if (flags & ~IORING_TIMEOUT_ABS) return -EINVAL; - data = &io->timeout; + req->timeout.count = READ_ONCE(sqe->off); + + if (!req->io && io_alloc_async_ctx(req)) + return -ENOMEM; + + data = &req->io->timeout; data->req = req; req->flags |= REQ_F_TIMEOUT; @@ -2680,32 +2914,17 @@ static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, data->mode = HRTIMER_MODE_REL; hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); - req->io = io; return 0; } -static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_timeout(struct io_kiocb *req) { unsigned count; struct io_ring_ctx *ctx = req->ctx; struct io_timeout_data *data; - struct io_async_ctx *io; struct list_head *entry; unsigned span = 0; - io = req->io; - if (!io) { - int ret; - - io = kmalloc(sizeof(*io), GFP_KERNEL); - if (!io) - return -ENOMEM; - ret = io_timeout_prep(req, io, false); - if (ret) { - kfree(io); - return ret; - } - } data = &req->io->timeout; /* @@ -2713,7 +2932,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) * timeout event to be satisfied. If it isn't set, then this is * a pure timeout request, sequence isn't used. */ - count = READ_ONCE(sqe->off); + count = req->timeout.count; if (!count) { req->flags |= REQ_F_TIMEOUT_NOSEQ; spin_lock_irq(&ctx->completion_lock); @@ -2831,84 +3050,104 @@ done: io_put_req_find_next(req, nxt); } -static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt) +static int io_async_cancel_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) { - struct io_ring_ctx *ctx = req->ctx; - - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->flags || sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags) return -EINVAL; - io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), nxt, 0); + req->cancel.addr = READ_ONCE(sqe->addr); return 0; } -static int io_req_defer_prep(struct io_kiocb *req, struct io_async_ctx *io) +static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt) { - struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; - struct iov_iter iter; - ssize_t ret; + struct io_ring_ctx *ctx = req->ctx; - memcpy(&io->sqe, req->sqe, sizeof(io->sqe)); - req->sqe = &io->sqe; + io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0); + return 0; +} + +static int io_req_defer_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + ssize_t ret = 0; - switch (io->sqe.opcode) { + switch (req->opcode) { + case IORING_OP_NOP: + break; case IORING_OP_READV: case IORING_OP_READ_FIXED: - ret = io_read_prep(req, &iovec, &iter, true); + ret = io_read_prep(req, sqe, true); break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: - ret = io_write_prep(req, &iovec, &iter, true); + ret = io_write_prep(req, sqe, true); + break; + case IORING_OP_POLL_ADD: + ret = io_poll_add_prep(req, sqe); + break; + case IORING_OP_POLL_REMOVE: + ret = io_poll_remove_prep(req, sqe); + break; + case IORING_OP_FSYNC: + ret = io_prep_fsync(req, sqe); + break; + case IORING_OP_SYNC_FILE_RANGE: + ret = io_prep_sfr(req, sqe); break; case IORING_OP_SENDMSG: - ret = io_sendmsg_prep(req, io); + ret = io_sendmsg_prep(req, sqe); break; case IORING_OP_RECVMSG: - ret = io_recvmsg_prep(req, io); + ret = io_recvmsg_prep(req, sqe); break; case IORING_OP_CONNECT: - ret = io_connect_prep(req, io); + ret = io_connect_prep(req, sqe); break; case IORING_OP_TIMEOUT: - return io_timeout_prep(req, io, false); + ret = io_timeout_prep(req, sqe, false); + break; + case IORING_OP_TIMEOUT_REMOVE: + ret = io_timeout_remove_prep(req, sqe); + break; + case IORING_OP_ASYNC_CANCEL: + ret = io_async_cancel_prep(req, sqe); + break; case IORING_OP_LINK_TIMEOUT: - return io_timeout_prep(req, io, true); + ret = io_timeout_prep(req, sqe, true); + break; + case IORING_OP_ACCEPT: + ret = io_accept_prep(req, sqe); + break; default: - req->io = io; - return 0; + printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", + req->opcode); + ret = -EINVAL; + break; } - if (ret < 0) - return ret; - - req->io = io; - io_req_map_io(req, ret, iovec, inline_vecs, &iter); - return 0; + return ret; } -static int io_req_defer(struct io_kiocb *req) +static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; - struct io_async_ctx *io; int ret; /* Still need defer if there is pending req in defer list. */ if (!req_need_defer(req) && list_empty(&ctx->defer_list)) return 0; - io = kmalloc(sizeof(*io), GFP_KERNEL); - if (!io) + if (!req->io && io_alloc_async_ctx(req)) return -EAGAIN; - ret = io_req_defer_prep(req, io); - if (ret < 0) { - kfree(io); + ret = io_req_defer_prep(req, sqe); + if (ret < 0) return ret; - } spin_lock_irq(&ctx->completion_lock); if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { @@ -2922,66 +3161,121 @@ static int io_req_defer(struct io_kiocb *req) return -EIOCBQUEUED; } -__attribute__((nonnull)) -static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, - bool force_nonblock) +static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_kiocb **nxt, bool force_nonblock) { - int ret, opcode; struct io_ring_ctx *ctx = req->ctx; + int ret; - opcode = READ_ONCE(req->sqe->opcode); - switch (opcode) { + switch (req->opcode) { case IORING_OP_NOP: ret = io_nop(req); break; case IORING_OP_READV: - if (unlikely(req->sqe->buf_index)) - return -EINVAL; - ret = io_read(req, nxt, force_nonblock); - break; - case IORING_OP_WRITEV: - if (unlikely(req->sqe->buf_index)) - return -EINVAL; - ret = io_write(req, nxt, force_nonblock); - break; case IORING_OP_READ_FIXED: + if (sqe) { + ret = io_read_prep(req, sqe, force_nonblock); + if (ret < 0) + break; + } ret = io_read(req, nxt, force_nonblock); break; + case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: + if (sqe) { + ret = io_write_prep(req, sqe, force_nonblock); + if (ret < 0) + break; + } ret = io_write(req, nxt, force_nonblock); break; case IORING_OP_FSYNC: - ret = io_fsync(req, req->sqe, nxt, force_nonblock); + if (sqe) { + ret = io_prep_fsync(req, sqe); + if (ret < 0) + break; + } + ret = io_fsync(req, nxt, force_nonblock); break; case IORING_OP_POLL_ADD: - ret = io_poll_add(req, req->sqe, nxt); + if (sqe) { + ret = io_poll_add_prep(req, sqe); + if (ret) + break; + } + ret = io_poll_add(req, nxt); break; case IORING_OP_POLL_REMOVE: - ret = io_poll_remove(req, req->sqe); + if (sqe) { + ret = io_poll_remove_prep(req, sqe); + if (ret < 0) + break; + } + ret = io_poll_remove(req); break; case IORING_OP_SYNC_FILE_RANGE: - ret = io_sync_file_range(req, req->sqe, nxt, force_nonblock); + if (sqe) { + ret = io_prep_sfr(req, sqe); + if (ret < 0) + break; + } + ret = io_sync_file_range(req, nxt, force_nonblock); break; case IORING_OP_SENDMSG: - ret = io_sendmsg(req, req->sqe, nxt, force_nonblock); + if (sqe) { + ret = io_sendmsg_prep(req, sqe); + if (ret < 0) + break; + } + ret = io_sendmsg(req, nxt, force_nonblock); break; case IORING_OP_RECVMSG: - ret = io_recvmsg(req, req->sqe, nxt, force_nonblock); + if (sqe) { + ret = io_recvmsg_prep(req, sqe); + if (ret) + break; + } + ret = io_recvmsg(req, nxt, force_nonblock); break; case IORING_OP_TIMEOUT: - ret = io_timeout(req, req->sqe); + if (sqe) { + ret = io_timeout_prep(req, sqe, false); + if (ret) + break; + } + ret = io_timeout(req); break; case IORING_OP_TIMEOUT_REMOVE: - ret = io_timeout_remove(req, req->sqe); + if (sqe) { + ret = io_timeout_remove_prep(req, sqe); + if (ret) + break; + } + ret = io_timeout_remove(req); break; case IORING_OP_ACCEPT: - ret = io_accept(req, req->sqe, nxt, force_nonblock); + if (sqe) { + ret = io_accept_prep(req, sqe); + if (ret) + break; + } + ret = io_accept(req, nxt, force_nonblock); break; case IORING_OP_CONNECT: - ret = io_connect(req, req->sqe, nxt, force_nonblock); + if (sqe) { + ret = io_connect_prep(req, sqe); + if (ret) + break; + } + ret = io_connect(req, nxt, force_nonblock); break; case IORING_OP_ASYNC_CANCEL: - ret = io_async_cancel(req, req->sqe, nxt); + if (sqe) { + ret = io_async_cancel_prep(req, sqe); + if (ret) + break; + } + ret = io_async_cancel(req, nxt); break; default: ret = -EINVAL; @@ -2992,24 +3286,24 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, return ret; if (ctx->flags & IORING_SETUP_IOPOLL) { + const bool in_async = io_wq_current_is_worker(); + if (req->result == -EAGAIN) return -EAGAIN; + /* workqueue context doesn't hold uring_lock, grab it now */ + if (in_async) + mutex_lock(&ctx->uring_lock); + io_iopoll_req_issued(req); + + if (in_async) + mutex_unlock(&ctx->uring_lock); } return 0; } -static void io_link_work_cb(struct io_wq_work **workptr) -{ - struct io_wq_work *work = *workptr; - struct io_kiocb *link = work->data; - - io_queue_linked_timeout(link); - work->func = io_wq_submit_work; -} - static void io_wq_submit_work(struct io_wq_work **workptr) { struct io_wq_work *work = *workptr; @@ -3017,9 +3311,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr) struct io_kiocb *nxt = NULL; int ret = 0; - /* Ensure we clear previously set non-block flag */ - req->rw.ki_flags &= ~IOCB_NOWAIT; - if (work->flags & IO_WQ_WORK_CANCEL) ret = -ECANCELED; @@ -3027,7 +3318,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr) req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; req->in_async = true; do { - ret = io_issue_sqe(req, &nxt, false); + ret = io_issue_sqe(req, NULL, &nxt, false); /* * We can get EAGAIN for polled IO even though we're * forcing a sync submission from here, since we can't @@ -3049,17 +3340,8 @@ static void io_wq_submit_work(struct io_wq_work **workptr) } /* if a dependent link is ready, pass it back */ - if (!ret && nxt) { - struct io_kiocb *link; - - io_prep_async_work(nxt, &link); - *workptr = &nxt->work; - if (link) { - nxt->work.flags |= IO_WQ_WORK_CB; - nxt->work.func = io_link_work_cb; - nxt->work.data = link; - } - } + if (!ret && nxt) + io_wq_assign_next(workptr, nxt); } static bool io_req_op_valid(int op) @@ -3067,11 +3349,9 @@ static bool io_req_op_valid(int op) return op >= IORING_OP_NOP && op < IORING_OP_LAST; } -static int io_op_needs_file(const struct io_uring_sqe *sqe) +static int io_req_needs_file(struct io_kiocb *req) { - int op = READ_ONCE(sqe->opcode); - - switch (op) { + switch (req->opcode) { case IORING_OP_NOP: case IORING_OP_POLL_REMOVE: case IORING_OP_TIMEOUT: @@ -3080,7 +3360,7 @@ static int io_op_needs_file(const struct io_uring_sqe *sqe) case IORING_OP_LINK_TIMEOUT: return 0; default: - if (io_req_op_valid(op)) + if (io_req_op_valid(req->opcode)) return 1; return -EINVAL; } @@ -3095,19 +3375,20 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, return table->files[index & IORING_FILE_TABLE_MASK]; } -static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req) +static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, + const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; unsigned flags; int fd, ret; - flags = READ_ONCE(req->sqe->flags); - fd = READ_ONCE(req->sqe->fd); + flags = READ_ONCE(sqe->flags); + fd = READ_ONCE(sqe->fd); if (flags & IOSQE_IO_DRAIN) req->flags |= REQ_F_IO_DRAIN; - ret = io_op_needs_file(req->sqe); + ret = io_req_needs_file(req); if (ret <= 0) return ret; @@ -3227,14 +3508,14 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, link_list); - if (!nxt || nxt->sqe->opcode != IORING_OP_LINK_TIMEOUT) + if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT) return NULL; req->flags |= REQ_F_LINK_TIMEOUT; return nxt; } -static void __io_queue_sqe(struct io_kiocb *req) +static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_kiocb *linked_timeout; struct io_kiocb *nxt = NULL; @@ -3243,7 +3524,7 @@ static void __io_queue_sqe(struct io_kiocb *req) again: linked_timeout = io_prep_linked_timeout(req); - ret = io_issue_sqe(req, &nxt, true); + ret = io_issue_sqe(req, sqe, &nxt, true); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -3290,7 +3571,7 @@ done_req: } } -static void io_queue_sqe(struct io_kiocb *req) +static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) { int ret; @@ -3300,7 +3581,7 @@ static void io_queue_sqe(struct io_kiocb *req) } req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK); - ret = io_req_defer(req); + ret = io_req_defer(req, sqe); if (ret) { if (ret != -EIOCBQUEUED) { io_cqring_add_event(req, ret); @@ -3308,7 +3589,7 @@ static void io_queue_sqe(struct io_kiocb *req) io_double_put_req(req); } } else - __io_queue_sqe(req); + __io_queue_sqe(req, sqe); } static inline void io_queue_link_head(struct io_kiocb *req) @@ -3317,27 +3598,25 @@ static inline void io_queue_link_head(struct io_kiocb *req) io_cqring_add_event(req, -ECANCELED); io_double_put_req(req); } else - io_queue_sqe(req); + io_queue_sqe(req, NULL); } #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ IOSQE_IO_HARDLINK) -static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, - struct io_kiocb **link) +static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, + struct io_submit_state *state, struct io_kiocb **link) { struct io_ring_ctx *ctx = req->ctx; int ret; - req->user_data = req->sqe->user_data; - /* enforce forwards compatibility on users */ - if (unlikely(req->sqe->flags & ~SQE_VALID_FLAGS)) { + if (unlikely(sqe->flags & ~SQE_VALID_FLAGS)) { ret = -EINVAL; goto err_req; } - ret = io_req_set_file(state, req); + ret = io_req_set_file(state, req, sqe); if (unlikely(ret)) { err_req: io_cqring_add_event(req, ret); @@ -3354,38 +3633,38 @@ err_req: */ if (*link) { struct io_kiocb *prev = *link; - struct io_async_ctx *io; - if (req->sqe->flags & IOSQE_IO_DRAIN) + if (sqe->flags & IOSQE_IO_DRAIN) (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; - if (req->sqe->flags & IOSQE_IO_HARDLINK) + if (sqe->flags & IOSQE_IO_HARDLINK) req->flags |= REQ_F_HARDLINK; - io = kmalloc(sizeof(*io), GFP_KERNEL); - if (!io) { + if (io_alloc_async_ctx(req)) { ret = -EAGAIN; goto err_req; } - ret = io_req_defer_prep(req, io); + ret = io_req_defer_prep(req, sqe); if (ret) { - kfree(io); /* fail even hard links since we don't submit */ prev->flags |= REQ_F_FAIL_LINK; goto err_req; } trace_io_uring_link(ctx, req, prev); list_add_tail(&req->link_list, &prev->link_list); - } else if (req->sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { + } else if (sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { req->flags |= REQ_F_LINK; - if (req->sqe->flags & IOSQE_IO_HARDLINK) + if (sqe->flags & IOSQE_IO_HARDLINK) req->flags |= REQ_F_HARDLINK; INIT_LIST_HEAD(&req->link_list); + ret = io_req_defer_prep(req, sqe); + if (ret) + req->flags |= REQ_F_FAIL_LINK; *link = req; } else { - io_queue_sqe(req); + io_queue_sqe(req, sqe); } return true; @@ -3430,14 +3709,15 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) } /* - * Fetch an sqe, if one is available. Note that s->sqe will point to memory + * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory * that is mapped by userspace. This means that care needs to be taken to * ensure that reads are stable, as we cannot rely on userspace always * being a good citizen. If members of the sqe are validated and then later * used, it's important that those reads are done through READ_ONCE() to * prevent a re-load down the line. */ -static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req) +static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, + const struct io_uring_sqe **sqe_ptr) { struct io_rings *rings = ctx->rings; u32 *sq_array = ctx->sq_array; @@ -3464,7 +3744,9 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req) * link list. */ req->sequence = ctx->cached_sq_head; - req->sqe = &ctx->sq_sqes[head]; + *sqe_ptr = &ctx->sq_sqes[head]; + req->opcode = READ_ONCE((*sqe_ptr)->opcode); + req->user_data = READ_ONCE((*sqe_ptr)->user_data); ctx->cached_sq_head++; return true; } @@ -3496,6 +3778,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } for (i = 0; i < nr; i++) { + const struct io_uring_sqe *sqe; struct io_kiocb *req; unsigned int sqe_flags; @@ -3505,12 +3788,12 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, submitted = -EAGAIN; break; } - if (!io_get_sqring(ctx, req)) { + if (!io_get_sqring(ctx, req, &sqe)) { __io_free_req(req); break; } - if (io_sqe_needs_user(req->sqe) && !*mm) { + if (io_req_needs_user(req) && !*mm) { mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm); if (!mm_fault) { use_mm(ctx->sqo_mm); @@ -3519,22 +3802,21 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } submitted++; - sqe_flags = req->sqe->flags; + sqe_flags = sqe->flags; req->ring_file = ring_file; req->ring_fd = ring_fd; req->has_user = *mm != NULL; req->in_async = async; req->needs_fixed_file = async; - trace_io_uring_submit_sqe(ctx, req->sqe->user_data, - true, async); - if (!io_submit_sqe(req, statep, &link)) + trace_io_uring_submit_sqe(ctx, req->user_data, true, async); + if (!io_submit_sqe(req, sqe, statep, &link)) break; /* * If previous wasn't linked and we have a linked command, * that's the end of the chain. Submit the previous link. */ - if (!(sqe_flags & IOSQE_IO_LINK) && link) { + if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) && link) { io_queue_link_head(link); link = NULL; } @@ -3694,7 +3976,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush) struct io_ring_ctx *ctx = iowq->ctx; /* - * Wake up if we have enough events, or if a timeout occured since we + * Wake up if we have enough events, or if a timeout occurred since we * started waiting. For timeouts, we always want to return to userspace, * regardless of event count. */ @@ -4446,7 +4728,7 @@ static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) return -EFAULT; - dst->iov_base = (void __user *) (unsigned long) ciov.iov_base; + dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); dst->iov_len = ciov.iov_len; return 0; } @@ -4877,6 +5159,12 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } else if (to_submit) { struct mm_struct *cur_mm; + if (current->mm != ctx->sqo_mm || + current_cred() != ctx->creds) { + ret = -EPERM; + goto out; + } + to_submit = min(to_submit, ctx->sq_entries); mutex_lock(&ctx->uring_lock); /* already have mm, so io_submit_sqes() won't try to grab it */ @@ -4884,6 +5172,9 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, submitted = io_submit_sqes(ctx, to_submit, f.file, fd, &cur_mm, false); mutex_unlock(&ctx->uring_lock); + + if (submitted != to_submit) + goto out; } if (flags & IORING_ENTER_GETEVENTS) { unsigned nr_events = 0; @@ -4897,6 +5188,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } } +out: percpu_ref_put(&ctx->refs); out_fput: fdput(f); |