summaryrefslogtreecommitdiff
path: root/fs/io_uring.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--fs/io_uring.c699
1 files changed, 402 insertions, 297 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 358f97be9c7b..381d50becd04 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -186,14 +186,23 @@ struct fixed_file_table {
struct file **files;
};
+struct fixed_file_ref_node {
+ struct percpu_ref refs;
+ struct list_head node;
+ struct list_head file_list;
+ struct fixed_file_data *file_data;
+ struct work_struct work;
+};
+
struct fixed_file_data {
struct fixed_file_table *table;
struct io_ring_ctx *ctx;
+ struct percpu_ref *cur_refs;
struct percpu_ref refs;
- struct llist_head put_llist;
- struct work_struct ref_work;
struct completion done;
+ struct list_head ref_list;
+ spinlock_t lock;
};
struct io_buffer {
@@ -317,6 +326,8 @@ struct io_ring_ctx {
spinlock_t inflight_lock;
struct list_head inflight_list;
} ____cacheline_aligned_in_smp;
+
+ struct work_struct exit_work;
};
/*
@@ -346,7 +357,6 @@ struct io_timeout_data {
struct hrtimer timer;
struct timespec64 ts;
enum hrtimer_mode mode;
- u32 seq_offset;
};
struct io_accept {
@@ -374,7 +384,7 @@ struct io_timeout {
struct file *file;
u64 addr;
int flags;
- unsigned count;
+ u32 count;
};
struct io_rw {
@@ -497,6 +507,7 @@ enum {
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
+ REQ_F_LINK_HEAD_BIT,
REQ_F_LINK_NEXT_BIT,
REQ_F_FAIL_LINK_BIT,
REQ_F_INFLIGHT_BIT,
@@ -532,6 +543,8 @@ enum {
/* IOSQE_BUFFER_SELECT */
REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
+ /* head of a link */
+ REQ_F_LINK_HEAD = BIT(REQ_F_LINK_HEAD_BIT),
/* already grabbed next link */
REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
/* fail rest of links */
@@ -599,6 +612,7 @@ struct io_kiocb {
};
struct io_async_ctx *io;
+ int cflags;
bool needs_fixed_file;
u8 opcode;
@@ -606,10 +620,8 @@ struct io_kiocb {
struct list_head list;
unsigned int flags;
refcount_t refs;
- union {
- struct task_struct *task;
- unsigned long fsize;
- };
+ struct task_struct *task;
+ unsigned long fsize;
u64 user_data;
u32 result;
u32 sequence;
@@ -618,6 +630,8 @@ struct io_kiocb {
struct list_head inflight_entry;
+ struct percpu_ref *fixed_file_refs;
+
union {
/*
* Only commands that never go async can use the below fields,
@@ -629,7 +643,6 @@ struct io_kiocb {
struct callback_head task_work;
struct hlist_node hash_node;
struct async_poll *apoll;
- int cflags;
};
struct io_wq_work work;
};
@@ -848,7 +861,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *ip,
unsigned nr_args);
static int io_grab_files(struct io_kiocb *req);
-static void io_ring_file_ref_flush(struct fixed_file_data *data);
static void io_cleanup_req(struct io_kiocb *req);
static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
int fd, struct file **out_file, bool fixed);
@@ -945,8 +957,8 @@ static inline bool __req_need_defer(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
- + atomic_read(&ctx->cached_cq_overflow);
+ return req->sequence != ctx->cached_cq_tail
+ + atomic_read(&ctx->cached_cq_overflow);
}
static inline bool req_need_defer(struct io_kiocb *req)
@@ -1285,8 +1297,8 @@ static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
return NULL;
}
-static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
- struct io_submit_state *state)
+static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
+ struct io_submit_state *state)
{
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct io_kiocb *req;
@@ -1319,41 +1331,20 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
req = state->reqs[state->free_reqs];
}
-got_it:
- req->io = NULL;
- req->file = NULL;
- req->ctx = ctx;
- req->flags = 0;
- /* one is dropped after submission, the other at completion */
- refcount_set(&req->refs, 2);
- req->result = 0;
- INIT_IO_WORK(&req->work, io_wq_submit_work);
return req;
fallback:
- req = io_get_fallback_req(ctx);
- if (req)
- goto got_it;
- percpu_ref_put(&ctx->refs);
- return NULL;
+ return io_get_fallback_req(ctx);
}
static inline void io_put_file(struct io_kiocb *req, struct file *file,
bool fixed)
{
if (fixed)
- percpu_ref_put(&req->ctx->file_data->refs);
+ percpu_ref_put(req->fixed_file_refs);
else
fput(file);
}
-static void __io_req_do_free(struct io_kiocb *req)
-{
- if (likely(!io_is_fallback_req(req)))
- kmem_cache_free(req_cachep, req);
- else
- clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
-}
-
static void __io_req_aux_free(struct io_kiocb *req)
{
if (req->flags & REQ_F_NEED_CLEANUP)
@@ -1362,6 +1353,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
kfree(req->io);
if (req->file)
io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
+ if (req->task)
+ put_task_struct(req->task);
io_req_work_drop_env(req);
}
@@ -1382,7 +1375,10 @@ static void __io_free_req(struct io_kiocb *req)
}
percpu_ref_put(&req->ctx->refs);
- __io_req_do_free(req);
+ if (likely(!io_is_fallback_req(req)))
+ kmem_cache_free(req_cachep, req);
+ else
+ clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
}
struct req_batch {
@@ -1393,21 +1389,18 @@ struct req_batch {
static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
{
- int fixed_refs = rb->to_free;
-
if (!rb->to_free)
return;
if (rb->need_iter) {
int i, inflight = 0;
unsigned long flags;
- fixed_refs = 0;
for (i = 0; i < rb->to_free; i++) {
struct io_kiocb *req = rb->reqs[i];
if (req->flags & REQ_F_FIXED_FILE) {
req->file = NULL;
- fixed_refs++;
+ percpu_ref_put(req->fixed_file_refs);
}
if (req->flags & REQ_F_INFLIGHT)
inflight++;
@@ -1433,8 +1426,6 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
}
do_free:
kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
- if (fixed_refs)
- percpu_ref_put_many(&ctx->file_data->refs, fixed_refs);
percpu_ref_put_many(&ctx->refs, rb->to_free);
rb->to_free = rb->need_iter = 0;
}
@@ -1448,7 +1439,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
if (ret != -1) {
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(ctx);
- req->flags &= ~REQ_F_LINK;
+ req->flags &= ~REQ_F_LINK_HEAD;
io_put_req(req);
return true;
}
@@ -1484,7 +1475,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
list_del_init(&req->link_list);
if (!list_empty(&nxt->link_list))
- nxt->flags |= REQ_F_LINK;
+ nxt->flags |= REQ_F_LINK_HEAD;
*nxtptr = nxt;
break;
}
@@ -1495,7 +1486,7 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
}
/*
- * Called if REQ_F_LINK is set, and we fail the head request
+ * Called if REQ_F_LINK_HEAD is set, and we fail the head request
*/
static void io_fail_links(struct io_kiocb *req)
{
@@ -1528,7 +1519,7 @@ static void io_fail_links(struct io_kiocb *req)
static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
{
- if (likely(!(req->flags & REQ_F_LINK)))
+ if (likely(!(req->flags & REQ_F_LINK_HEAD)))
return;
/*
@@ -1680,7 +1671,7 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
{
- if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req))
+ if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req))
return false;
if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
@@ -1738,11 +1729,24 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
io_free_req_many(ctx, &rb);
}
+static void io_iopoll_queue(struct list_head *again)
+{
+ struct io_kiocb *req;
+
+ do {
+ req = list_first_entry(again, struct io_kiocb, list);
+ list_del(&req->list);
+ refcount_inc(&req->refs);
+ io_queue_async_work(req);
+ } while (!list_empty(again));
+}
+
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
long min)
{
struct io_kiocb *req, *tmp;
LIST_HEAD(done);
+ LIST_HEAD(again);
bool spin;
int ret;
@@ -1757,9 +1761,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
struct kiocb *kiocb = &req->rw.kiocb;
/*
- * Move completed entries to our local list. If we find a
- * request that requires polling, break out and complete
- * the done list first, if we have entries there.
+ * Move completed and retryable entries to our local lists.
+ * If we find a request that requires polling, break out
+ * and complete those lists first, if we have entries there.
*/
if (req->flags & REQ_F_IOPOLL_COMPLETED) {
list_move_tail(&req->list, &done);
@@ -1768,6 +1772,13 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (!list_empty(&done))
break;
+ if (req->result == -EAGAIN) {
+ list_move_tail(&req->list, &again);
+ continue;
+ }
+ if (!list_empty(&again))
+ break;
+
ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
if (ret < 0)
break;
@@ -1780,6 +1791,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
if (!list_empty(&done))
io_iopoll_complete(ctx, nr_events, &done);
+ if (!list_empty(&again))
+ io_iopoll_queue(&again);
+
return ret;
}
@@ -2465,8 +2479,9 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
req->io->rw.iov = iovec;
if (!req->io->rw.iov) {
req->io->rw.iov = req->io->rw.fast_iov;
- memcpy(req->io->rw.iov, fast_iov,
- sizeof(struct iovec) * iter->nr_segs);
+ if (req->io->rw.iov != fast_iov)
+ memcpy(req->io->rw.iov, fast_iov,
+ sizeof(struct iovec) * iter->nr_segs);
} else {
req->flags |= REQ_F_NEED_CLEANUP;
}
@@ -2549,7 +2564,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
req->result = 0;
io_size = ret;
- if (req->flags & REQ_F_LINK)
+ if (req->flags & REQ_F_LINK_HEAD)
req->result = io_size;
/*
@@ -2640,7 +2655,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
req->result = 0;
io_size = ret;
- if (req->flags & REQ_F_LINK)
+ if (req->flags & REQ_F_LINK_HEAD)
req->result = io_size;
/*
@@ -2747,7 +2762,7 @@ static bool io_splice_punt(struct file *file)
return false;
if (!io_file_supports_async(file))
return true;
- return !(file->f_mode & O_NONBLOCK);
+ return !(file->f_flags & O_NONBLOCK);
}
static int io_splice(struct io_kiocb *req, bool force_nonblock)
@@ -2920,7 +2935,7 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
- if (sqe->flags & IOSQE_FIXED_FILE)
+ if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
@@ -2929,6 +2944,8 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->open.how.mode = READ_ONCE(sqe->len);
fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
req->open.how.flags = READ_ONCE(sqe->open_flags);
+ if (force_o_largefile())
+ req->open.how.flags |= O_LARGEFILE;
req->open.filename = getname(fname);
if (IS_ERR(req->open.filename)) {
@@ -2951,7 +2968,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
- if (sqe->flags & IOSQE_FIXED_FILE)
+ if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
@@ -3305,7 +3322,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
- if (sqe->flags & IOSQE_FIXED_FILE)
+ if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
@@ -3382,7 +3399,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
sqe->rw_flags || sqe->buf_index)
return -EINVAL;
- if (sqe->flags & IOSQE_FIXED_FILE)
+ if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
req->close.fd = READ_ONCE(sqe->fd);
@@ -3481,14 +3498,11 @@ static void __io_sync_file_range(struct io_kiocb *req)
static void io_sync_file_range_finish(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
- struct io_kiocb *nxt = NULL;
if (io_req_cancelled(req))
return;
__io_sync_file_range(req);
io_put_req(req); /* put submission ref */
- if (nxt)
- io_wq_assign_next(workptr, nxt);
}
static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
@@ -4114,6 +4128,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
__poll_t mask, task_work_func_t func)
{
struct task_struct *tsk;
+ int ret;
/* for instances that support it check for an event match first: */
if (mask && !(mask & poll->events))
@@ -4127,29 +4142,70 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
req->result = mask;
init_task_work(&req->task_work, func);
/*
- * If this fails, then the task is exiting. If that is the case, then
- * the exit check will ultimately cancel these work items. Hence we
- * don't need to check here and handle it specifically.
+ * If this fails, then the task is exiting. Punt to one of the io-wq
+ * threads to ensure the work gets run, we can't always rely on exit
+ * cancelation taking care of this.
*/
- task_work_add(tsk, &req->task_work, true);
+ ret = task_work_add(tsk, &req->task_work, true);
+ if (unlikely(ret)) {
+ tsk = io_wq_get_task(req->ctx->io_wq);
+ task_work_add(tsk, &req->task_work, true);
+ }
wake_up_process(tsk);
return 1;
}
+static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
+ __acquires(&req->ctx->completion_lock)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!req->result && !READ_ONCE(poll->canceled)) {
+ struct poll_table_struct pt = { ._key = poll->events };
+
+ req->result = vfs_poll(req->file, &pt) & poll->events;
+ }
+
+ spin_lock_irq(&ctx->completion_lock);
+ if (!req->result && !READ_ONCE(poll->canceled)) {
+ add_wait_queue(poll->head, &poll->wait);
+ return true;
+ }
+
+ return false;
+}
+
static void io_async_task_func(struct callback_head *cb)
{
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
struct async_poll *apoll = req->apoll;
struct io_ring_ctx *ctx = req->ctx;
+ bool canceled;
trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
- WARN_ON_ONCE(!list_empty(&req->apoll->poll.wait.entry));
+ if (io_poll_rewait(req, &apoll->poll)) {
+ spin_unlock_irq(&ctx->completion_lock);
+ return;
+ }
- if (hash_hashed(&req->hash_node)) {
- spin_lock_irq(&ctx->completion_lock);
+ if (hash_hashed(&req->hash_node))
hash_del(&req->hash_node);
- spin_unlock_irq(&ctx->completion_lock);
+
+ canceled = READ_ONCE(apoll->poll.canceled);
+ if (canceled) {
+ io_cqring_fill_event(req, -ECANCELED);
+ io_commit_cqring(ctx);
+ }
+
+ spin_unlock_irq(&ctx->completion_lock);
+
+ if (canceled) {
+ kfree(apoll);
+ io_cqring_ev_posted(ctx);
+ req_set_fail_links(req);
+ io_put_req(req);
+ return;
}
/* restore ->work in case we need to retry again */
@@ -4251,10 +4307,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
req->flags |= REQ_F_POLLED;
memcpy(&apoll->work, &req->work, sizeof(req->work));
- /*
- * Don't need a reference here, as we're adding it to the task
- * task_works list. If the task exits, the list is pruned.
- */
+ get_task_struct(current);
req->task = current;
req->apoll = apoll;
INIT_HLIST_NODE(&req->hash_node);
@@ -4301,11 +4354,13 @@ static bool __io_poll_remove_one(struct io_kiocb *req,
static bool io_poll_remove_one(struct io_kiocb *req)
{
+ struct async_poll *apoll = NULL;
bool do_complete;
if (req->opcode == IORING_OP_POLL_ADD) {
do_complete = __io_poll_remove_one(req, &req->poll);
} else {
+ apoll = req->apoll;
/* non-poll requests have submit ref still */
do_complete = __io_poll_remove_one(req, &req->apoll->poll);
if (do_complete)
@@ -4314,6 +4369,14 @@ static bool io_poll_remove_one(struct io_kiocb *req)
hash_del(&req->hash_node);
+ if (apoll) {
+ /*
+ * restore ->work because we need to call io_req_work_drop_env.
+ */
+ memcpy(&req->work, &apoll->work, sizeof(req->work));
+ kfree(apoll);
+ }
+
if (do_complete) {
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(req->ctx);
@@ -4328,7 +4391,7 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
{
struct hlist_node *tmp;
struct io_kiocb *req;
- int i;
+ int posted = 0, i;
spin_lock_irq(&ctx->completion_lock);
for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
@@ -4336,11 +4399,12 @@ static void io_poll_remove_all(struct io_ring_ctx *ctx)
list = &ctx->cancel_hash[i];
hlist_for_each_entry_safe(req, tmp, list, hash_node)
- io_poll_remove_one(req);
+ posted += io_poll_remove_one(req);
}
spin_unlock_irq(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
+ if (posted)
+ io_cqring_ev_posted(ctx);
}
static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
@@ -4407,8 +4471,13 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
{
struct io_ring_ctx *ctx = req->ctx;
+ struct io_poll_iocb *poll = &req->poll;
+
+ if (io_poll_rewait(req, poll)) {
+ spin_unlock_irq(&ctx->completion_lock);
+ return;
+ }
- spin_lock_irq(&ctx->completion_lock);
hash_del(&req->hash_node);
io_poll_complete(req, req->result, 0);
req->flags |= REQ_F_COMP_LOCKED;
@@ -4465,10 +4534,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
events = READ_ONCE(sqe->poll_events);
poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
- /*
- * Don't need a reference here, as we're adding it to the task
- * task_works list. If the task exits, the list is pruned.
- */
+ get_task_struct(current);
req->task = current;
return 0;
}
@@ -4642,11 +4708,12 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
static int io_timeout(struct io_kiocb *req)
{
- unsigned count;
struct io_ring_ctx *ctx = req->ctx;
struct io_timeout_data *data;
struct list_head *entry;
unsigned span = 0;
+ u32 count = req->timeout.count;
+ u32 seq = req->sequence;
data = &req->io->timeout;
@@ -4655,7 +4722,6 @@ static int io_timeout(struct io_kiocb *req)
* timeout event to be satisfied. If it isn't set, then this is
* a pure timeout request, sequence isn't used.
*/
- count = req->timeout.count;
if (!count) {
req->flags |= REQ_F_TIMEOUT_NOSEQ;
spin_lock_irq(&ctx->completion_lock);
@@ -4663,8 +4729,7 @@ static int io_timeout(struct io_kiocb *req)
goto add;
}
- req->sequence = ctx->cached_sq_head + count - 1;
- data->seq_offset = count;
+ req->sequence = seq + count;
/*
* Insertion sort, ensuring the first entry in the list is always
@@ -4673,26 +4738,26 @@ static int io_timeout(struct io_kiocb *req)
spin_lock_irq(&ctx->completion_lock);
list_for_each_prev(entry, &ctx->timeout_list) {
struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
- unsigned nxt_sq_head;
+ unsigned nxt_seq;
long long tmp, tmp_nxt;
- u32 nxt_offset = nxt->io->timeout.seq_offset;
+ u32 nxt_offset = nxt->timeout.count;
if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
continue;
/*
- * Since cached_sq_head + count - 1 can overflow, use type long
+ * Since seq + count can overflow, use type long
* long to store it.
*/
- tmp = (long long)ctx->cached_sq_head + count - 1;
- nxt_sq_head = nxt->sequence - nxt_offset + 1;
- tmp_nxt = (long long)nxt_sq_head + nxt_offset - 1;
+ tmp = (long long)seq + count;
+ nxt_seq = nxt->sequence - nxt_offset;
+ tmp_nxt = (long long)nxt_seq + nxt_offset;
/*
* cached_sq_head may overflow, and it will never overflow twice
* once there is some timeout req still be valid.
*/
- if (ctx->cached_sq_head < nxt_sq_head)
+ if (seq < nxt_seq)
tmp += UINT_MAX;
if (tmp > tmp_nxt)
@@ -5331,7 +5396,8 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
file = io_file_from_index(ctx, fd);
if (!file)
return -EBADF;
- percpu_ref_get(&ctx->file_data->refs);
+ req->fixed_file_refs = ctx->file_data->cur_refs;
+ percpu_ref_get(req->fixed_file_refs);
} else {
trace_io_uring_file_get(ctx, fd);
file = __io_file_get(state, fd);
@@ -5344,15 +5410,10 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
}
static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+ int fd, unsigned int flags)
{
- unsigned flags;
- int fd;
bool fixed;
- flags = READ_ONCE(sqe->flags);
- fd = READ_ONCE(sqe->fd);
-
if (!io_req_needs_file(req, fd))
return 0;
@@ -5457,7 +5518,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
struct io_kiocb *nxt;
- if (!(req->flags & REQ_F_LINK))
+ if (!(req->flags & REQ_F_LINK_HEAD))
return NULL;
/* for polled retry, if flag is set, we already went through here */
if (req->flags & REQ_F_POLLED)
@@ -5585,53 +5646,11 @@ static inline void io_queue_link_head(struct io_kiocb *req)
io_queue_sqe(req, NULL);
}
-#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
- IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
- IOSQE_BUFFER_SELECT)
-
-static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
struct io_submit_state *state, struct io_kiocb **link)
{
struct io_ring_ctx *ctx = req->ctx;
- unsigned int sqe_flags;
- int ret, id;
-
- sqe_flags = READ_ONCE(sqe->flags);
-
- /* enforce forwards compatibility on users */
- if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
- ret = -EINVAL;
- goto err_req;
- }
-
- if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
- !io_op_defs[req->opcode].buffer_select) {
- ret = -EOPNOTSUPP;
- goto err_req;
- }
-
- id = READ_ONCE(sqe->personality);
- if (id) {
- req->work.creds = idr_find(&ctx->personality_idr, id);
- if (unlikely(!req->work.creds)) {
- ret = -EINVAL;
- goto err_req;
- }
- get_cred(req->work.creds);
- }
-
- /* same numerical values with corresponding REQ_F_*, safe to copy */
- req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
- IOSQE_ASYNC | IOSQE_FIXED_FILE |
- IOSQE_BUFFER_SELECT);
-
- ret = io_req_set_file(state, req, sqe);
- if (unlikely(ret)) {
-err_req:
- io_cqring_add_event(req, ret);
- io_double_put_req(req);
- return false;
- }
+ int ret;
/*
* If we already have a head request, queue this one for async
@@ -5650,42 +5669,39 @@ err_req:
* next after the link request. The last one is done via
* drain_next flag to persist the effect across calls.
*/
- if (sqe_flags & IOSQE_IO_DRAIN) {
+ if (req->flags & REQ_F_IO_DRAIN) {
head->flags |= REQ_F_IO_DRAIN;
ctx->drain_next = 1;
}
- if (io_alloc_async_ctx(req)) {
- ret = -EAGAIN;
- goto err_req;
- }
+ if (io_alloc_async_ctx(req))
+ return -EAGAIN;
ret = io_req_defer_prep(req, sqe);
if (ret) {
/* fail even hard links since we don't submit */
head->flags |= REQ_F_FAIL_LINK;
- goto err_req;
+ return ret;
}
trace_io_uring_link(ctx, req, head);
list_add_tail(&req->link_list, &head->link_list);
/* last request of a link, enqueue the link */
- if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) {
+ if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
io_queue_link_head(head);
*link = NULL;
}
} else {
if (unlikely(ctx->drain_next)) {
req->flags |= REQ_F_IO_DRAIN;
- req->ctx->drain_next = 0;
+ ctx->drain_next = 0;
}
- if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
- req->flags |= REQ_F_LINK;
+ if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
+ req->flags |= REQ_F_LINK_HEAD;
INIT_LIST_HEAD(&req->link_list);
- if (io_alloc_async_ctx(req)) {
- ret = -EAGAIN;
- goto err_req;
- }
+ if (io_alloc_async_ctx(req))
+ return -EAGAIN;
+
ret = io_req_defer_prep(req, sqe);
if (ret)
req->flags |= REQ_F_FAIL_LINK;
@@ -5695,7 +5711,7 @@ err_req:
}
}
- return true;
+ return 0;
}
/*
@@ -5741,8 +5757,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
* used, it's important that those reads are done through READ_ONCE() to
* prevent a re-load down the line.
*/
-static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct io_uring_sqe **sqe_ptr)
+static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
{
u32 *sq_array = ctx->sq_array;
unsigned head;
@@ -5756,35 +5771,91 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
* though the application is the one updating it.
*/
head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
- if (likely(head < ctx->sq_entries)) {
- /*
- * All io need record the previous position, if LINK vs DARIN,
- * it can be used to mark the position of the first IO in the
- * link list.
- */
- req->sequence = ctx->cached_sq_head;
- *sqe_ptr = &ctx->sq_sqes[head];
- req->opcode = READ_ONCE((*sqe_ptr)->opcode);
- req->user_data = READ_ONCE((*sqe_ptr)->user_data);
- ctx->cached_sq_head++;
- return true;
- }
+ if (likely(head < ctx->sq_entries))
+ return &ctx->sq_sqes[head];
/* drop invalid entries */
- ctx->cached_sq_head++;
ctx->cached_sq_dropped++;
WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
- return false;
+ return NULL;
+}
+
+static inline void io_consume_sqe(struct io_ring_ctx *ctx)
+{
+ ctx->cached_sq_head++;
+}
+
+#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
+ IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
+ IOSQE_BUFFER_SELECT)
+
+static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ const struct io_uring_sqe *sqe,
+ struct io_submit_state *state, bool async)
+{
+ unsigned int sqe_flags;
+ int id, fd;
+
+ /*
+ * All io need record the previous position, if LINK vs DARIN,
+ * it can be used to mark the position of the first IO in the
+ * link list.
+ */
+ req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
+ req->opcode = READ_ONCE(sqe->opcode);
+ req->user_data = READ_ONCE(sqe->user_data);
+ req->io = NULL;
+ req->file = NULL;
+ req->ctx = ctx;
+ req->flags = 0;
+ /* one is dropped after submission, the other at completion */
+ refcount_set(&req->refs, 2);
+ req->task = NULL;
+ req->result = 0;
+ req->needs_fixed_file = async;
+ INIT_IO_WORK(&req->work, io_wq_submit_work);
+
+ if (unlikely(req->opcode >= IORING_OP_LAST))
+ return -EINVAL;
+
+ if (io_op_defs[req->opcode].needs_mm && !current->mm) {
+ if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
+ return -EFAULT;
+ use_mm(ctx->sqo_mm);
+ }
+
+ sqe_flags = READ_ONCE(sqe->flags);
+ /* enforce forwards compatibility on users */
+ if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
+ return -EINVAL;
+
+ if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
+ !io_op_defs[req->opcode].buffer_select)
+ return -EOPNOTSUPP;
+
+ id = READ_ONCE(sqe->personality);
+ if (id) {
+ req->work.creds = idr_find(&ctx->personality_idr, id);
+ if (unlikely(!req->work.creds))
+ return -EINVAL;
+ get_cred(req->work.creds);
+ }
+
+ /* same numerical values with corresponding REQ_F_*, safe to copy */
+ req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK |
+ IOSQE_ASYNC | IOSQE_FIXED_FILE |
+ IOSQE_BUFFER_SELECT | IOSQE_IO_LINK);
+
+ fd = READ_ONCE(sqe->fd);
+ return io_req_set_file(state, req, fd, sqe_flags);
}
static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
- struct file *ring_file, int ring_fd,
- struct mm_struct **mm, bool async)
+ struct file *ring_file, int ring_fd, bool async)
{
struct io_submit_state state, *statep = NULL;
struct io_kiocb *link = NULL;
int i, submitted = 0;
- bool mm_fault = false;
/* if we have a backlog and couldn't flush it all, return BUSY */
if (test_bit(0, &ctx->sq_check_overflow)) {
@@ -5812,43 +5883,35 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
struct io_kiocb *req;
int err;
- req = io_get_req(ctx, statep);
+ sqe = io_get_sqe(ctx);
+ if (unlikely(!sqe)) {
+ io_consume_sqe(ctx);
+ break;
+ }
+ req = io_alloc_req(ctx, statep);
if (unlikely(!req)) {
if (!submitted)
submitted = -EAGAIN;
break;
}
- if (!io_get_sqring(ctx, req, &sqe)) {
- __io_req_do_free(req);
- break;
- }
+ err = io_init_req(ctx, req, sqe, statep, async);
+ io_consume_sqe(ctx);
/* will complete beyond this point, count as submitted */
submitted++;
- if (unlikely(req->opcode >= IORING_OP_LAST)) {
- err = -EINVAL;
+ if (unlikely(err)) {
fail_req:
io_cqring_add_event(req, err);
io_double_put_req(req);
break;
}
- if (io_op_defs[req->opcode].needs_mm && !*mm) {
- mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
- if (unlikely(mm_fault)) {
- err = -EFAULT;
- goto fail_req;
- }
- use_mm(ctx->sqo_mm);
- *mm = ctx->sqo_mm;
- }
-
- req->needs_fixed_file = async;
trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
true, async);
- if (!io_submit_sqe(req, sqe, statep, &link))
- break;
+ err = io_submit_sqe(req, sqe, statep, &link);
+ if (err)
+ goto fail_req;
}
if (unlikely(submitted != nr)) {
@@ -5867,10 +5930,19 @@ fail_req:
return submitted;
}
+static inline void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (mm) {
+ unuse_mm(mm);
+ mmput(mm);
+ }
+}
+
static int io_sq_thread(void *data)
{
struct io_ring_ctx *ctx = data;
- struct mm_struct *cur_mm = NULL;
const struct cred *old_cred;
mm_segment_t old_fs;
DEFINE_WAIT(wait);
@@ -5911,11 +5983,7 @@ static int io_sq_thread(void *data)
* adding ourselves to the waitqueue, as the unuse/drop
* may sleep.
*/
- if (cur_mm) {
- unuse_mm(cur_mm);
- mmput(cur_mm);
- cur_mm = NULL;
- }
+ io_sq_thread_drop_mm(ctx);
/*
* We're polling. If we're within the defined idle
@@ -5962,6 +6030,7 @@ static int io_sq_thread(void *data)
}
if (current->task_works) {
task_work_run();
+ finish_wait(&ctx->sqo_wait, &wait);
continue;
}
if (signal_pending(current))
@@ -5978,7 +6047,7 @@ static int io_sq_thread(void *data)
}
mutex_lock(&ctx->uring_lock);
- ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
+ ret = io_submit_sqes(ctx, to_submit, NULL, -1, true);
mutex_unlock(&ctx->uring_lock);
timeout = jiffies + ctx->sq_thread_idle;
}
@@ -5987,10 +6056,7 @@ static int io_sq_thread(void *data)
task_work_run();
set_fs(old_fs);
- if (cur_mm) {
- unuse_mm(cur_mm);
- mmput(cur_mm);
- }
+ io_sq_thread_drop_mm(ctx);
revert_creds(old_cred);
kthread_parkme();
@@ -6124,43 +6190,36 @@ static void io_file_ref_kill(struct percpu_ref *ref)
complete(&data->done);
}
-static void io_file_ref_exit_and_free(struct work_struct *work)
-{
- struct fixed_file_data *data;
-
- data = container_of(work, struct fixed_file_data, ref_work);
-
- /*
- * Ensure any percpu-ref atomic switch callback has run, it could have
- * been in progress when the files were being unregistered. Once
- * that's done, we can safely exit and free the ref and containing
- * data structure.
- */
- rcu_barrier();
- percpu_ref_exit(&data->refs);
- kfree(data);
-}
-
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
struct fixed_file_data *data = ctx->file_data;
+ struct fixed_file_ref_node *ref_node = NULL;
unsigned nr_tables, i;
+ unsigned long flags;
if (!data)
return -ENXIO;
- percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill);
- flush_work(&data->ref_work);
+ spin_lock_irqsave(&data->lock, flags);
+ if (!list_empty(&data->ref_list))
+ ref_node = list_first_entry(&data->ref_list,
+ struct fixed_file_ref_node, node);
+ spin_unlock_irqrestore(&data->lock, flags);
+ if (ref_node)
+ percpu_ref_kill(&ref_node->refs);
+
+ percpu_ref_kill(&data->refs);
+
+ /* wait for all refs nodes to complete */
wait_for_completion(&data->done);
- io_ring_file_ref_flush(data);
__io_sqe_files_unregister(ctx);
nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
for (i = 0; i < nr_tables; i++)
kfree(data->table[i].files);
kfree(data->table);
- INIT_WORK(&data->ref_work, io_file_ref_exit_and_free);
- queue_work(system_wq, &data->ref_work);
+ percpu_ref_exit(&data->refs);
+ kfree(data);
ctx->file_data = NULL;
ctx->nr_user_files = 0;
return 0;
@@ -6204,13 +6263,6 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
struct sk_buff *skb;
int i, nr_files;
- if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
- unsigned long inflight = ctx->user->unix_inflight + nr;
-
- if (inflight > task_rlimit(current, RLIMIT_NOFILE))
- return -EMFILE;
- }
-
fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
if (!fpl)
return -ENOMEM;
@@ -6385,46 +6437,72 @@ static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
}
struct io_file_put {
- struct llist_node llist;
+ struct list_head list;
struct file *file;
};
-static void io_ring_file_ref_flush(struct fixed_file_data *data)
+static void io_file_put_work(struct work_struct *work)
{
+ struct fixed_file_ref_node *ref_node;
+ struct fixed_file_data *file_data;
+ struct io_ring_ctx *ctx;
struct io_file_put *pfile, *tmp;
- struct llist_node *node;
+ unsigned long flags;
- while ((node = llist_del_all(&data->put_llist)) != NULL) {
- llist_for_each_entry_safe(pfile, tmp, node, llist) {
- io_ring_file_put(data->ctx, pfile->file);
- kfree(pfile);
- }
+ ref_node = container_of(work, struct fixed_file_ref_node, work);
+ file_data = ref_node->file_data;
+ ctx = file_data->ctx;
+
+ list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) {
+ list_del_init(&pfile->list);
+ io_ring_file_put(ctx, pfile->file);
+ kfree(pfile);
}
+
+ spin_lock_irqsave(&file_data->lock, flags);
+ list_del_init(&ref_node->node);
+ spin_unlock_irqrestore(&file_data->lock, flags);
+
+ percpu_ref_exit(&ref_node->refs);
+ kfree(ref_node);
+ percpu_ref_put(&file_data->refs);
}
-static void io_ring_file_ref_switch(struct work_struct *work)
+static void io_file_data_ref_zero(struct percpu_ref *ref)
{
- struct fixed_file_data *data;
+ struct fixed_file_ref_node *ref_node;
+
+ ref_node = container_of(ref, struct fixed_file_ref_node, refs);
- data = container_of(work, struct fixed_file_data, ref_work);
- io_ring_file_ref_flush(data);
- percpu_ref_switch_to_percpu(&data->refs);
+ queue_work(system_wq, &ref_node->work);
}
-static void io_file_data_ref_zero(struct percpu_ref *ref)
+static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
+ struct io_ring_ctx *ctx)
{
- struct fixed_file_data *data;
+ struct fixed_file_ref_node *ref_node;
- data = container_of(ref, struct fixed_file_data, refs);
+ ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
+ if (!ref_node)
+ return ERR_PTR(-ENOMEM);
- /*
- * We can't safely switch from inside this context, punt to wq. If
- * the table ref is going away, the table is being unregistered.
- * Don't queue up the async work for that case, the caller will
- * handle it.
- */
- if (!percpu_ref_is_dying(&data->refs))
- queue_work(system_wq, &data->ref_work);
+ if (percpu_ref_init(&ref_node->refs, io_file_data_ref_zero,
+ 0, GFP_KERNEL)) {
+ kfree(ref_node);
+ return ERR_PTR(-ENOMEM);
+ }
+ INIT_LIST_HEAD(&ref_node->node);
+ INIT_LIST_HEAD(&ref_node->file_list);
+ INIT_WORK(&ref_node->work, io_file_put_work);
+ ref_node->file_data = ctx->file_data;
+ return ref_node;
+
+}
+
+static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node)
+{
+ percpu_ref_exit(&ref_node->refs);
+ kfree(ref_node);
}
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
@@ -6435,6 +6513,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
struct file *file;
int fd, ret = 0;
unsigned i;
+ struct fixed_file_ref_node *ref_node;
+ unsigned long flags;
if (ctx->file_data)
return -EBUSY;
@@ -6448,6 +6528,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return -ENOMEM;
ctx->file_data->ctx = ctx;
init_completion(&ctx->file_data->done);
+ INIT_LIST_HEAD(&ctx->file_data->ref_list);
+ spin_lock_init(&ctx->file_data->lock);
nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
ctx->file_data->table = kcalloc(nr_tables,
@@ -6459,15 +6541,13 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return -ENOMEM;
}
- if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero,
+ if (percpu_ref_init(&ctx->file_data->refs, io_file_ref_kill,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
kfree(ctx->file_data->table);
kfree(ctx->file_data);
ctx->file_data = NULL;
return -ENOMEM;
}
- ctx->file_data->put_llist.first = NULL;
- INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch);
if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
percpu_ref_exit(&ctx->file_data->refs);
@@ -6530,9 +6610,22 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
}
ret = io_sqe_files_scm(ctx);
- if (ret)
+ if (ret) {
io_sqe_files_unregister(ctx);
+ return ret;
+ }
+
+ ref_node = alloc_fixed_file_ref_node(ctx);
+ if (IS_ERR(ref_node)) {
+ io_sqe_files_unregister(ctx);
+ return PTR_ERR(ref_node);
+ }
+ ctx->file_data->cur_refs = &ref_node->refs;
+ spin_lock_irqsave(&ctx->file_data->lock, flags);
+ list_add(&ref_node->node, &ctx->file_data->ref_list);
+ spin_unlock_irqrestore(&ctx->file_data->lock, flags);
+ percpu_ref_get(&ctx->file_data->refs);
return ret;
}
@@ -6579,30 +6672,21 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
#endif
}
-static void io_atomic_switch(struct percpu_ref *ref)
-{
- struct fixed_file_data *data;
-
- /*
- * Juggle reference to ensure we hit zero, if needed, so we can
- * switch back to percpu mode
- */
- data = container_of(ref, struct fixed_file_data, refs);
- percpu_ref_put(&data->refs);
- percpu_ref_get(&data->refs);
-}
-
static int io_queue_file_removal(struct fixed_file_data *data,
- struct file *file)
+ struct file *file)
{
struct io_file_put *pfile;
+ struct percpu_ref *refs = data->cur_refs;
+ struct fixed_file_ref_node *ref_node;
pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
if (!pfile)
return -ENOMEM;
+ ref_node = container_of(refs, struct fixed_file_ref_node, refs);
pfile->file = file;
- llist_add(&pfile->llist, &data->put_llist);
+ list_add(&pfile->list, &ref_node->file_list);
+
return 0;
}
@@ -6611,17 +6695,23 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
unsigned nr_args)
{
struct fixed_file_data *data = ctx->file_data;
- bool ref_switch = false;
+ struct fixed_file_ref_node *ref_node;
struct file *file;
__s32 __user *fds;
int fd, i, err;
__u32 done;
+ unsigned long flags;
+ bool needs_switch = false;
if (check_add_overflow(up->offset, nr_args, &done))
return -EOVERFLOW;
if (done > ctx->nr_user_files)
return -EINVAL;
+ ref_node = alloc_fixed_file_ref_node(ctx);
+ if (IS_ERR(ref_node))
+ return PTR_ERR(ref_node);
+
done = 0;
fds = u64_to_user_ptr(up->fds);
while (nr_args) {
@@ -6642,7 +6732,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
if (err)
break;
table->files[index] = NULL;
- ref_switch = true;
+ needs_switch = true;
}
if (fd != -1) {
file = fget(fd);
@@ -6673,11 +6763,19 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
up->offset++;
}
- if (ref_switch)
- percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
+ if (needs_switch) {
+ percpu_ref_kill(data->cur_refs);
+ spin_lock_irqsave(&data->lock, flags);
+ list_add(&ref_node->node, &data->ref_list);
+ data->cur_refs = &ref_node->refs;
+ spin_unlock_irqrestore(&data->lock, flags);
+ percpu_ref_get(&ctx->file_data->refs);
+ } else
+ destroy_fixed_file_ref_node(ref_node);
return done ? done : err;
}
+
static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
@@ -7203,6 +7301,18 @@ static int io_remove_personalities(int id, void *p, void *data)
return 0;
}
+static void io_ring_exit_work(struct work_struct *work)
+{
+ struct io_ring_ctx *ctx;
+
+ ctx = container_of(work, struct io_ring_ctx, exit_work);
+ if (ctx->rings)
+ io_cqring_overflow_flush(ctx, true);
+
+ wait_for_completion(&ctx->completions[0]);
+ io_ring_ctx_free(ctx);
+}
+
static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
mutex_lock(&ctx->uring_lock);
@@ -7230,8 +7340,8 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
if (ctx->rings)
io_cqring_overflow_flush(ctx, true);
idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
- wait_for_completion(&ctx->completions[0]);
- io_ring_ctx_free(ctx);
+ INIT_WORK(&ctx->exit_work, io_ring_exit_work);
+ queue_work(system_wq, &ctx->exit_work);
}
static int io_uring_release(struct inode *inode, struct file *file)
@@ -7427,13 +7537,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
wake_up(&ctx->sqo_wait);
submitted = to_submit;
} else if (to_submit) {
- struct mm_struct *cur_mm;
-
mutex_lock(&ctx->uring_lock);
- /* already have mm, so io_submit_sqes() won't try to grab it */
- cur_mm = ctx->sqo_mm;
- submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
- &cur_mm, false);
+ submitted = io_submit_sqes(ctx, to_submit, f.file, fd, false);
mutex_unlock(&ctx->uring_lock);
if (submitted != to_submit)