diff options
-rw-r--r-- | fs/aio.c | 83 |
1 files changed, 63 insertions, 20 deletions
@@ -181,8 +181,9 @@ struct poll_iocb { struct file *file; struct wait_queue_head *head; __poll_t events; - bool done; bool cancelled; + bool work_scheduled; + bool work_need_resched; struct wait_queue_entry wait; struct work_struct work; }; @@ -1638,14 +1639,26 @@ static void aio_poll_complete_work(struct work_struct *work) * avoid further branches in the fast path. */ spin_lock_irq(&ctx->ctx_lock); + spin_lock(&req->head->lock); if (!mask && !READ_ONCE(req->cancelled)) { - add_wait_queue(req->head, &req->wait); + /* + * The request isn't actually ready to be completed yet. + * Reschedule completion if another wakeup came in. + */ + if (req->work_need_resched) { + schedule_work(&req->work); + req->work_need_resched = false; + } else { + req->work_scheduled = false; + } + spin_unlock(&req->head->lock); spin_unlock_irq(&ctx->ctx_lock); return; } + list_del_init(&req->wait.entry); + spin_unlock(&req->head->lock); list_del_init(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); - req->done = true; spin_unlock_irq(&ctx->ctx_lock); iocb_put(iocb); @@ -1659,9 +1672,9 @@ static int aio_poll_cancel(struct kiocb *iocb) spin_lock(&req->head->lock); WRITE_ONCE(req->cancelled, true); - if (!list_empty(&req->wait.entry)) { - list_del_init(&req->wait.entry); + if (!req->work_scheduled) { schedule_work(&aiocb->poll.work); + req->work_scheduled = true; } spin_unlock(&req->head->lock); @@ -1680,20 +1693,26 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (mask && !(mask & req->events)) return 0; - list_del_init(&req->wait.entry); - - if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { + /* + * Complete the request inline if possible. This requires that three + * conditions be met: + * 1. An event mask must have been passed. If a plain wakeup was done + * instead, then mask == 0 and we have to call vfs_poll() to get + * the events, so inline completion isn't possible. + * 2. The completion work must not have already been scheduled. + * 3. ctx_lock must not be busy. We have to use trylock because we + * already hold the waitqueue lock, so this inverts the normal + * locking order. Use irqsave/irqrestore because not all + * filesystems (e.g. fuse) call this function with IRQs disabled, + * yet IRQs have to be disabled before ctx_lock is obtained. + */ + if (mask && !req->work_scheduled && + spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { struct kioctx *ctx = iocb->ki_ctx; - /* - * Try to complete the iocb inline if we can. Use - * irqsave/irqrestore because not all filesystems (e.g. fuse) - * call this function with IRQs disabled and because IRQs - * have to be disabled before ctx_lock is obtained. - */ + list_del_init(&req->wait.entry); list_del(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); - req->done = true; if (iocb->ki_eventfd && eventfd_signal_allowed()) { iocb = NULL; INIT_WORK(&req->work, aio_poll_put_work); @@ -1703,7 +1722,20 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (iocb) iocb_put(iocb); } else { - schedule_work(&req->work); + /* + * Schedule the completion work if needed. If it was already + * scheduled, record that another wakeup came in. + * + * Don't remove the request from the waitqueue here, as it might + * not actually be complete yet (we won't know until vfs_poll() + * is called), and we must not miss any wakeups. + */ + if (req->work_scheduled) { + req->work_need_resched = true; + } else { + schedule_work(&req->work); + req->work_scheduled = true; + } } return 1; } @@ -1750,8 +1782,9 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; req->head = NULL; - req->done = false; req->cancelled = false; + req->work_scheduled = false; + req->work_need_resched = false; apt.pt._qproc = aio_poll_queue_proc; apt.pt._key = req->events; @@ -1766,17 +1799,27 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) spin_lock_irq(&ctx->ctx_lock); if (likely(req->head)) { spin_lock(&req->head->lock); - if (unlikely(list_empty(&req->wait.entry))) { - if (apt.error) + if (list_empty(&req->wait.entry) || req->work_scheduled) { + /* + * aio_poll_wake() already either scheduled the async + * completion work, or completed the request inline. + */ + if (apt.error) /* unsupported case: multiple queues */ cancel = true; apt.error = 0; mask = 0; } if (mask || apt.error) { + /* Steal to complete synchronously. */ list_del_init(&req->wait.entry); } else if (cancel) { + /* Cancel if possible (may be too late though). */ WRITE_ONCE(req->cancelled, true); - } else if (!req->done) { /* actually waiting for an event */ + } else if (!list_empty(&req->wait.entry)) { + /* + * Actually waiting for an event, so add the request to + * active_reqs so that it can be cancelled if needed. + */ list_add_tail(&aiocb->ki_list, &ctx->active_reqs); aiocb->ki_cancel = aio_poll_cancel; } |