Diffstat (limited to 'fs/io_uring.c')
-rw-r--r-- | fs/io_uring.c | 274
1 file changed, 137 insertions, 137 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c
index dc506b75659c..ce69bd9b0838 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -540,7 +540,6 @@ enum {
 	REQ_F_ISREG_BIT,
 	REQ_F_COMP_LOCKED_BIT,
 	REQ_F_NEED_CLEANUP_BIT,
-	REQ_F_OVERFLOW_BIT,
 	REQ_F_POLLED_BIT,
 	REQ_F_BUFFER_SELECTED_BIT,
 	REQ_F_NO_FILE_TABLE_BIT,
@@ -583,8 +582,6 @@ enum {
 	REQ_F_COMP_LOCKED	= BIT(REQ_F_COMP_LOCKED_BIT),
 	/* needs cleanup */
 	REQ_F_NEED_CLEANUP	= BIT(REQ_F_NEED_CLEANUP_BIT),
-	/* in overflow list */
-	REQ_F_OVERFLOW		= BIT(REQ_F_OVERFLOW_BIT),
 	/* already went through poll handler */
 	REQ_F_POLLED		= BIT(REQ_F_POLLED_BIT),
 	/* buffer already selected */
@@ -946,7 +943,8 @@ static void io_get_req_task(struct io_kiocb *req)
 
 static inline void io_clean_op(struct io_kiocb *req)
 {
-	if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
+	if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED |
+			  REQ_F_INFLIGHT))
 		__io_clean_op(req);
 }
 
@@ -1152,7 +1150,7 @@ static void io_prep_async_work(struct io_kiocb *req)
 
 	io_req_init_async(req);
 	if (req->flags & REQ_F_ISREG) {
-		if (def->hash_reg_file)
+		if (def->hash_reg_file || (req->ctx->flags & IORING_SETUP_IOPOLL))
 			io_wq_hash_work(&req->work, file_inode(req->file));
 	} else {
 		if (def->unbound_nonreg_file)
@@ -1366,7 +1364,6 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 		req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
 						compl.list);
 		list_move(&req->compl.list, &list);
-		req->flags &= ~REQ_F_OVERFLOW;
 		if (cqe) {
 			WRITE_ONCE(cqe->user_data, req->user_data);
 			WRITE_ONCE(cqe->res, req->result);
@@ -1419,7 +1416,6 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
 			ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
 		}
 		io_clean_op(req);
-		req->flags |= REQ_F_OVERFLOW;
 		req->result = res;
 		req->compl.cflags = cflags;
 		refcount_inc(&req->refs);
@@ -1563,17 +1559,6 @@ static bool io_dismantle_req(struct io_kiocb *req)
 
 	if (req->file)
 		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
-	if (req->flags & REQ_F_INFLIGHT) {
-		struct io_ring_ctx *ctx = req->ctx;
-		unsigned long flags;
-
-		spin_lock_irqsave(&ctx->inflight_lock, flags);
-		list_del(&req->inflight_entry);
-		if (waitqueue_active(&ctx->inflight_wait))
-			wake_up(&ctx->inflight_wait);
-		spin_unlock_irqrestore(&ctx->inflight_lock, flags);
-	}
-
 	return io_req_clean_work(req);
 }
 
@@ -1761,7 +1746,8 @@ static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
 	return __io_req_find_next(req);
 }
 
-static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb)
+static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb,
+				bool twa_signal_ok)
 {
 	struct task_struct *tsk = req->task;
 	struct io_ring_ctx *ctx = req->ctx;
@@ -1774,7 +1760,7 @@ static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb)
 	 * will do the job.
 	 */
 	notify = 0;
-	if (!(ctx->flags & IORING_SETUP_SQPOLL))
+	if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok)
 		notify = TWA_SIGNAL;
 
 	ret = task_work_add(tsk, cb, notify);
@@ -1834,7 +1820,7 @@ static void io_req_task_queue(struct io_kiocb *req)
 	init_task_work(&req->task_work, io_req_task_submit);
 	percpu_ref_get(&req->ctx->refs);
 
-	ret = io_req_task_work_add(req, &req->task_work);
+	ret = io_req_task_work_add(req, &req->task_work, true);
 	if (unlikely(ret)) {
 		struct task_struct *tsk;
 
@@ -2063,6 +2049,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		req = list_first_entry(done, struct io_kiocb, inflight_entry);
 		if (READ_ONCE(req->result) == -EAGAIN) {
+			req->result = 0;
 			req->iopoll_completed = 0;
 			list_move_tail(&req->inflight_entry, &again);
 			continue;
 		}
@@ -2308,22 +2295,6 @@ end_req:
 	io_req_complete(req, ret);
 	return false;
 }
-
-static void io_rw_resubmit(struct callback_head *cb)
-{
-	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
-	struct io_ring_ctx *ctx = req->ctx;
-	int err;
-
-	err = io_sq_thread_acquire_mm(ctx, req);
-
-	if (io_resubmit_prep(req, err)) {
-		refcount_inc(&req->refs);
-		io_queue_async_work(req);
-	}
-
-	percpu_ref_put(&ctx->refs);
-}
 #endif
 
 static bool io_rw_reissue(struct io_kiocb *req, long res)
@@ -2334,12 +2305,14 @@ static bool io_rw_reissue(struct io_kiocb *req, long res)
 	if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
 		return false;
 
-	init_task_work(&req->task_work, io_rw_resubmit);
-	percpu_ref_get(&req->ctx->refs);
+	ret = io_sq_thread_acquire_mm(req->ctx, req);
 
-	ret = io_req_task_work_add(req, &req->task_work);
-	if (!ret)
+	if (io_resubmit_prep(req, ret)) {
+		refcount_inc(&req->refs);
+		io_queue_async_work(req);
 		return true;
+	}
+
 #endif
 	return false;
 }
@@ -2578,7 +2551,7 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
 		 * IO with EINTR.
 		 */
 		ret = -EINTR;
-		/* fall through */
+		fallthrough;
 	default:
 		kiocb->ki_complete(kiocb, ret, 0);
 	}
@@ -2819,22 +2792,15 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
 	return __io_iov_buffer_select(req, iov, needs_lock);
 }
 
-static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
-			       struct iovec **iovec, struct iov_iter *iter,
-			       bool needs_lock)
+static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
+				 struct iovec **iovec, struct iov_iter *iter,
+				 bool needs_lock)
 {
 	void __user *buf = u64_to_user_ptr(req->rw.addr);
 	size_t sqe_len = req->rw.len;
 	ssize_t ret;
 	u8 opcode;
 
-	if (req->io) {
-		struct io_async_rw *iorw = &req->io->rw;
-
-		*iovec = NULL;
-		return iov_iter_count(&iorw->iter);
-	}
-
 	opcode = req->opcode;
 	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
 		*iovec = NULL;
@@ -2848,10 +2814,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
 		if (req->flags & REQ_F_BUFFER_SELECT) {
 			buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
-			if (IS_ERR(buf)) {
-				*iovec = NULL;
+			if (IS_ERR(buf))
 				return PTR_ERR(buf);
-			}
 			req->rw.len = sqe_len;
 		}
 
@@ -2879,6 +2843,21 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 	return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
 }
 
+static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
+			       struct iovec **iovec, struct iov_iter *iter,
+			       bool needs_lock)
+{
+	if (!req->io)
+		return __io_import_iovec(rw, req, iovec, iter, needs_lock);
+	*iovec = NULL;
+	return iov_iter_count(&req->io->rw.iter);
+}
+
+static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
+{
+	return kiocb->ki_filp->f_mode & FMODE_STREAM ? NULL : &kiocb->ki_pos;
+}
+
 /*
  * For files that don't have ->read_iter() and ->write_iter(), handle them
  * by looping over ->read() or ->write() manually.
@@ -2914,10 +2893,10 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
 
 		if (rw == READ) {
 			nr = file->f_op->read(file, iovec.iov_base,
-					      iovec.iov_len, &kiocb->ki_pos);
+					      iovec.iov_len, io_kiocb_ppos(kiocb));
 		} else {
 			nr = file->f_op->write(file, iovec.iov_base,
-					       iovec.iov_len, &kiocb->ki_pos);
+					       iovec.iov_len, io_kiocb_ppos(kiocb));
 		}
 
 		if (iov_iter_is_bvec(iter))
@@ -3001,11 +2980,8 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw,
 	ssize_t ret;
 
 	iorw->iter.iov = iorw->fast_iov;
-	/* reset ->io around the iovec import, we don't want to use it */
-	req->io = NULL;
-	ret = io_import_iovec(rw, req, (struct iovec **) &iorw->iter.iov,
+	ret = __io_import_iovec(rw, req, (struct iovec **) &iorw->iter.iov,
				&iorw->iter, !force_nonblock);
-	req->io = container_of(iorw, struct io_async_ctx, rw);
 	if (unlikely(ret < 0))
 		return ret;
 
@@ -3061,7 +3037,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
 
 	/* submit ref gets dropped, acquire a new one */
 	refcount_inc(&req->refs);
-	ret = io_req_task_work_add(req, &req->task_work);
+	ret = io_req_task_work_add(req, &req->task_work, true);
 	if (unlikely(ret)) {
 		struct task_struct *tsk;
 
@@ -3074,27 +3050,6 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
 	return 1;
 }
 
-static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb,
-					     struct wait_page_queue *wait,
-					     wait_queue_func_t func,
-					     void *data)
-{
-	/* Can't support async wakeup with polled IO */
-	if (kiocb->ki_flags & IOCB_HIPRI)
-		return -EINVAL;
-	if (kiocb->ki_filp->f_mode & FMODE_BUF_RASYNC) {
-		wait->wait.func = func;
-		wait->wait.private = data;
-		wait->wait.flags = 0;
-		INIT_LIST_HEAD(&wait->wait.entry);
-		kiocb->ki_flags |= IOCB_WAITQ;
-		kiocb->ki_waitq = wait;
-		return 0;
-	}
-
-	return -EOPNOTSUPP;
-}
-
 /*
  * This controls whether a given IO request should be armed for async page
  * based retry. If we return false here, the request is handed to the async
@@ -3109,16 +3064,17 @@ static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb,
  */
 static bool io_rw_should_retry(struct io_kiocb *req)
 {
+	struct wait_page_queue *wait = &req->io->rw.wpq;
 	struct kiocb *kiocb = &req->rw.kiocb;
-	int ret;
 
 	/* never retry for NOWAIT, we just complete with -EAGAIN */
 	if (req->flags & REQ_F_NOWAIT)
 		return false;
 
 	/* Only for buffered IO */
-	if (kiocb->ki_flags & IOCB_DIRECT)
+	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
 		return false;
+
 	/*
 	 * just use poll if we can, and don't attempt if the fs doesn't
 	 * support callback based unlocks
@@ -3126,14 +3082,15 @@ static bool io_rw_should_retry(struct io_kiocb *req)
 	if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
 		return false;
 
-	ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq,
-						io_async_buf_func, req);
-	if (!ret) {
-		io_get_req_task(req);
-		return true;
-	}
+	wait->wait.func = io_async_buf_func;
+	wait->wait.private = req;
+	wait->wait.flags = 0;
+	INIT_LIST_HEAD(&wait->wait.entry);
+	kiocb->ki_flags |= IOCB_WAITQ;
+	kiocb->ki_waitq = wait;
 
-	return false;
+	io_get_req_task(req);
+	return true;
 }
 
 static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
@@ -3161,6 +3118,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
 	if (ret < 0)
 		return ret;
+	iov_count = iov_iter_count(iter);
 	io_size = ret;
 	req->result = io_size;
 	ret = 0;
@@ -3173,8 +3131,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	if (force_nonblock && !io_file_supports_async(req->file, READ))
 		goto copy_iov;
 
-	iov_count = iov_iter_count(iter);
-	ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
+	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count);
 	if (unlikely(ret))
 		goto out_free;
 
@@ -3186,14 +3143,18 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 		ret = 0;
 		goto out_free;
 	} else if (ret == -EAGAIN) {
-		if (!force_nonblock)
+		/* IOPOLL retry should happen for io-wq threads */
+		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
 			goto done;
+		/* some cases will consume bytes even on error returns */
+		iov_iter_revert(iter, iov_count - iov_iter_count(iter));
 		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
 		if (ret)
 			goto out_free;
 		return -EAGAIN;
 	} else if (ret < 0) {
-		goto out_free;
+		/* make sure -ERESTARTSYS -> -EINTR is done */
+		goto done;
 	}
 
 	/* read it all, or we did blocking attempt. no retry. */
@@ -3238,6 +3199,7 @@ done:
 	kiocb_done(kiocb, ret, cs);
 	ret = 0;
 out_free:
+	/* it's reportedly faster than delegating the null check to kfree() */
 	if (iovec)
 		kfree(iovec);
 	return ret;
@@ -3276,6 +3238,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 	ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
 	if (ret < 0)
 		return ret;
+	iov_count = iov_iter_count(iter);
 	io_size = ret;
 	req->result = io_size;
 
@@ -3292,8 +3255,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 	    (req->flags & REQ_F_ISREG))
 		goto copy_iov;
 
-	iov_count = iov_iter_count(iter);
-	ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
+	ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count);
 	if (unlikely(ret))
 		goto out_free;
 
@@ -3326,14 +3288,20 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
 		ret2 = -EAGAIN;
 	if (!force_nonblock || ret2 != -EAGAIN) {
+		/* IOPOLL retry should happen for io-wq threads */
+		if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
+			goto copy_iov;
 		kiocb_done(kiocb, ret2, cs);
 	} else {
copy_iov:
+		/* some cases will consume bytes even on error returns */
+		iov_iter_revert(iter, iov_count - iov_iter_count(iter));
 		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
 		if (!ret)
 			return -EAGAIN;
 	}
out_free:
+	/* it's reportedly faster than delegating the null check to kfree() */
 	if (iovec)
 		kfree(iovec);
 	return ret;
@@ -4600,6 +4568,7 @@ struct io_poll_table {
 static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
			   __poll_t mask, task_work_func_t func)
 {
+	bool twa_signal_ok;
 	int ret;
 
 	/* for instances that support it check for an event match first: */
@@ -4615,12 +4584,20 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
 	percpu_ref_get(&req->ctx->refs);
 
 	/*
+	 * If we using the signalfd wait_queue_head for this wakeup, then
+	 * it's not safe to use TWA_SIGNAL as we could be recursing on the
+	 * tsk->sighand->siglock on doing the wakeup. Should not be needed
+	 * either, as the normal wakeup will suffice.
+	 */
+	twa_signal_ok = (poll->head != &req->task->sighand->signalfd_wqh);
+
+	/*
 	 * If this fails, then the task is exiting. When a task exits, the
 	 * work gets canceled, so just cancel this request as well instead
 	 * of executing it. We can't safely execute it anyway, as we may not
 	 * have the needed state needed for it anyway.
 	 */
-	ret = io_req_task_work_add(req, &req->task_work);
+	ret = io_req_task_work_add(req, &req->task_work, twa_signal_ok);
 	if (unlikely(ret)) {
 		struct task_struct *tsk;
 
@@ -4909,12 +4886,20 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
 	struct async_poll *apoll;
 	struct io_poll_table ipt;
 	__poll_t mask, ret;
+	int rw;
 
 	if (!req->file || !file_can_poll(req->file))
 		return false;
 	if (req->flags & REQ_F_POLLED)
 		return false;
-	if (!def->pollin && !def->pollout)
+	if (def->pollin)
+		rw = READ;
+	else if (def->pollout)
+		rw = WRITE;
+	else
+		return false;
+	/* if we can't nonblock try, then no point in arming a poll handler */
+	if (!io_file_supports_async(req->file, rw))
 		return false;
 
 	apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
@@ -5653,6 +5638,18 @@ static void __io_clean_op(struct io_kiocb *req)
 		}
 		req->flags &= ~REQ_F_NEED_CLEANUP;
 	}
+
+	if (req->flags & REQ_F_INFLIGHT) {
+		struct io_ring_ctx *ctx = req->ctx;
+		unsigned long flags;
+
+		spin_lock_irqsave(&ctx->inflight_lock, flags);
+		list_del(&req->inflight_entry);
+		if (waitqueue_active(&ctx->inflight_wait))
+			wake_up(&ctx->inflight_wait);
+		spin_unlock_irqrestore(&ctx->inflight_lock, flags);
+		req->flags &= ~REQ_F_INFLIGHT;
+	}
 }
 
 static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
@@ -7455,9 +7452,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
 {
 	int ret;
 
-	mmgrab(current->mm);
-	ctx->sqo_mm = current->mm;
-
 	if (ctx->flags & IORING_SETUP_SQPOLL) {
 		ret = -EPERM;
 		if (!capable(CAP_SYS_ADMIN))
@@ -7502,10 +7496,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
 	return 0;
err:
 	io_finish_async(ctx);
-	if (ctx->sqo_mm) {
-		mmdrop(ctx->sqo_mm);
-		ctx->sqo_mm = NULL;
-	}
 	return ret;
 }
 
@@ -7979,7 +7969,13 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
			 ACCT_LOCKED);
 
 	INIT_WORK(&ctx->exit_work, io_ring_exit_work);
-	queue_work(system_wq, &ctx->exit_work);
+	/*
+	 * Use system_unbound_wq to avoid spawning tons of event kworkers
+	 * if we're exiting a ton of rings at the same time. It just adds
+	 * noise and overhead, there's no discernable change in runtime
+	 * over using system_wq.
+	 */
+	queue_work(system_unbound_wq, &ctx->exit_work);
 }
 
 static int io_uring_release(struct inode *inode, struct file *file)
@@ -8063,6 +8059,33 @@ static bool io_timeout_remove_link(struct io_ring_ctx *ctx,
 	return found;
 }
 
+static bool io_cancel_link_cb(struct io_wq_work *work, void *data)
+{
+	return io_match_link(container_of(work, struct io_kiocb, work), data);
+}
+
+static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+	enum io_wq_cancel cret;
+
+	/* cancel this particular work, if it's running */
+	cret = io_wq_cancel_work(ctx->io_wq, &req->work);
+	if (cret != IO_WQ_CANCEL_NOTFOUND)
+		return;
+
+	/* find links that hold this pending, cancel those */
+	cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true);
+	if (cret != IO_WQ_CANCEL_NOTFOUND)
+		return;
+
+	/* if we have a poll link holding this pending, cancel that */
+	if (io_poll_remove_link(ctx, req))
+		return;
+
+	/* final option, timeout link is holding this req pending */
+	io_timeout_remove_link(ctx, req);
+}
+
 static void io_uring_cancel_files(struct io_ring_ctx *ctx,
				  struct files_struct *files)
 {
@@ -8094,35 +8117,9 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 		/* We need to keep going until we don't find a matching req */
 		if (!cancel_req)
 			break;
-
-		if (cancel_req->flags & REQ_F_OVERFLOW) {
-			spin_lock_irq(&ctx->completion_lock);
-			list_del(&cancel_req->compl.list);
-			cancel_req->flags &= ~REQ_F_OVERFLOW;
-
-			io_cqring_mark_overflow(ctx);
-			WRITE_ONCE(ctx->rings->cq_overflow,
-				atomic_inc_return(&ctx->cached_cq_overflow));
-			io_commit_cqring(ctx);
-			spin_unlock_irq(&ctx->completion_lock);
-
-			/*
-			 * Put inflight ref and overflow ref. If that's
-			 * all we had, then we're done with this request.
-			 */
-			if (refcount_sub_and_test(2, &cancel_req->refs)) {
-				io_free_req(cancel_req);
-				finish_wait(&ctx->inflight_wait, &wait);
-				continue;
-			}
-		} else {
-			io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
-			/* could be a link, check and remove if it is */
-			if (!io_poll_remove_link(ctx, cancel_req))
-				io_timeout_remove_link(ctx, cancel_req);
-			io_put_req(cancel_req);
-		}
-
+		/* cancel this request, or head link requests */
+		io_attempt_cancel(ctx, cancel_req);
+		io_put_req(cancel_req);
 		schedule();
 		finish_wait(&ctx->inflight_wait, &wait);
 	}
@@ -8548,6 +8545,9 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 	ctx->user = user;
 	ctx->creds = get_current_cred();
 
+	mmgrab(current->mm);
+	ctx->sqo_mm = current->mm;
+
 	/*
	 * Account memory _before_ installing the file descriptor. Once
	 * the descriptor is installed, it can get closed at any time. Also