summaryrefslogtreecommitdiff
path: root/io_uring
diff options
context:
space:
mode:
Diffstat (limited to 'io_uring')
-rw-r--r--io_uring/Makefile6
-rw-r--r--io_uring/advise.c16
-rw-r--r--io_uring/cancel.h4
-rw-r--r--io_uring/eventfd.c160
-rw-r--r--io_uring/eventfd.h8
-rw-r--r--io_uring/io-wq.c39
-rw-r--r--io_uring/io-wq.h2
-rw-r--r--io_uring/io_uring.c155
-rw-r--r--io_uring/io_uring.h11
-rw-r--r--io_uring/memmap.c5
-rw-r--r--io_uring/msg_ring.c122
-rw-r--r--io_uring/msg_ring.h1
-rw-r--r--io_uring/napi.c24
-rw-r--r--io_uring/net.c100
-rw-r--r--io_uring/net.h6
-rw-r--r--io_uring/opdef.c39
-rw-r--r--io_uring/opdef.h4
-rw-r--r--io_uring/register.c69
-rw-r--r--io_uring/rsrc.c63
-rw-r--r--io_uring/rw.c9
-rw-r--r--io_uring/statx.c3
-rw-r--r--io_uring/xattr.c4
22 files changed, 536 insertions, 314 deletions
diff --git a/io_uring/Makefile b/io_uring/Makefile
index fc1b23c524e8..61923e11c767 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -4,9 +4,9 @@
obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
tctx.o filetable.o rw.o net.o poll.o \
- uring_cmd.o openclose.o sqpoll.o \
- xattr.o nop.o fs.o splice.o sync.o \
- msg_ring.o advise.o openclose.o \
+ eventfd.o uring_cmd.o openclose.o \
+ sqpoll.o xattr.o nop.o fs.o splice.o \
+ sync.o msg_ring.o advise.o openclose.o \
epoll.o statx.o timeout.o fdinfo.o \
cancel.o waitid.o register.o \
truncate.o memmap.o
diff --git a/io_uring/advise.c b/io_uring/advise.c
index 7085804c513c..cb7b881665e5 100644
--- a/io_uring/advise.c
+++ b/io_uring/advise.c
@@ -17,14 +17,14 @@
struct io_fadvise {
struct file *file;
u64 offset;
- u32 len;
+ u64 len;
u32 advice;
};
struct io_madvise {
struct file *file;
u64 addr;
- u32 len;
+ u64 len;
u32 advice;
};
@@ -33,11 +33,13 @@ int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise);
- if (sqe->buf_index || sqe->off || sqe->splice_fd_in)
+ if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
ma->addr = READ_ONCE(sqe->addr);
- ma->len = READ_ONCE(sqe->len);
+ ma->len = READ_ONCE(sqe->off);
+ if (!ma->len)
+ ma->len = READ_ONCE(sqe->len);
ma->advice = READ_ONCE(sqe->fadvise_advice);
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
@@ -78,11 +80,13 @@ int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise);
- if (sqe->buf_index || sqe->addr || sqe->splice_fd_in)
+ if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
fa->offset = READ_ONCE(sqe->off);
- fa->len = READ_ONCE(sqe->len);
+ fa->len = READ_ONCE(sqe->addr);
+ if (!fa->len)
+ fa->len = READ_ONCE(sqe->len);
fa->advice = READ_ONCE(sqe->fadvise_advice);
if (io_fadvise_force_async(fa))
req->flags |= REQ_F_FORCE_ASYNC;
diff --git a/io_uring/cancel.h b/io_uring/cancel.h
index 76b32e65c03c..b33995e00ba9 100644
--- a/io_uring/cancel.h
+++ b/io_uring/cancel.h
@@ -27,10 +27,10 @@ bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd);
static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence)
{
- if ((req->flags & REQ_F_CANCEL_SEQ) && sequence == req->work.cancel_seq)
+ if (req->cancel_seq_set && sequence == req->work.cancel_seq)
return true;
- req->flags |= REQ_F_CANCEL_SEQ;
+ req->cancel_seq_set = true;
req->work.cancel_seq = sequence;
return false;
}
diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
new file mode 100644
index 000000000000..b9384503a2b7
--- /dev/null
+++ b/io_uring/eventfd.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/eventfd.h>
+#include <linux/eventpoll.h>
+#include <linux/io_uring.h>
+#include <linux/io_uring_types.h>
+
+#include "io-wq.h"
+#include "eventfd.h"
+
+struct io_ev_fd {
+ struct eventfd_ctx *cq_ev_fd;
+ unsigned int eventfd_async: 1;
+ struct rcu_head rcu;
+ atomic_t refs;
+ atomic_t ops;
+};
+
+enum {
+ IO_EVENTFD_OP_SIGNAL_BIT,
+};
+
+static void io_eventfd_free(struct rcu_head *rcu)
+{
+ struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
+
+ eventfd_ctx_put(ev_fd->cq_ev_fd);
+ kfree(ev_fd);
+}
+
+static void io_eventfd_do_signal(struct rcu_head *rcu)
+{
+ struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
+
+ eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
+
+ if (atomic_dec_and_test(&ev_fd->refs))
+ io_eventfd_free(rcu);
+}
+
+void io_eventfd_signal(struct io_ring_ctx *ctx)
+{
+ struct io_ev_fd *ev_fd = NULL;
+
+ if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
+ return;
+
+ guard(rcu)();
+
+ /*
+ * rcu_dereference ctx->io_ev_fd once and use it for both for checking
+ * and eventfd_signal
+ */
+ ev_fd = rcu_dereference(ctx->io_ev_fd);
+
+ /*
+ * Check again if ev_fd exists incase an io_eventfd_unregister call
+ * completed between the NULL check of ctx->io_ev_fd at the start of
+ * the function and rcu_read_lock.
+ */
+ if (unlikely(!ev_fd))
+ return;
+ if (!atomic_inc_not_zero(&ev_fd->refs))
+ return;
+ if (ev_fd->eventfd_async && !io_wq_current_is_worker())
+ goto out;
+
+ if (likely(eventfd_signal_allowed())) {
+ eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
+ } else {
+ if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) {
+ call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal);
+ return;
+ }
+ }
+out:
+ if (atomic_dec_and_test(&ev_fd->refs))
+ call_rcu(&ev_fd->rcu, io_eventfd_free);
+}
+
+void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
+{
+ bool skip;
+
+ spin_lock(&ctx->completion_lock);
+
+ /*
+ * Eventfd should only get triggered when at least one event has been
+ * posted. Some applications rely on the eventfd notification count
+ * only changing IFF a new CQE has been added to the CQ ring. There's
+ * no depedency on 1:1 relationship between how many times this
+ * function is called (and hence the eventfd count) and number of CQEs
+ * posted to the CQ ring.
+ */
+ skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
+ ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
+ spin_unlock(&ctx->completion_lock);
+ if (skip)
+ return;
+
+ io_eventfd_signal(ctx);
+}
+
+int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned int eventfd_async)
+{
+ struct io_ev_fd *ev_fd;
+ __s32 __user *fds = arg;
+ int fd;
+
+ ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
+ lockdep_is_held(&ctx->uring_lock));
+ if (ev_fd)
+ return -EBUSY;
+
+ if (copy_from_user(&fd, fds, sizeof(*fds)))
+ return -EFAULT;
+
+ ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
+ if (!ev_fd)
+ return -ENOMEM;
+
+ ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
+ if (IS_ERR(ev_fd->cq_ev_fd)) {
+ int ret = PTR_ERR(ev_fd->cq_ev_fd);
+ kfree(ev_fd);
+ return ret;
+ }
+
+ spin_lock(&ctx->completion_lock);
+ ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
+ spin_unlock(&ctx->completion_lock);
+
+ ev_fd->eventfd_async = eventfd_async;
+ ctx->has_evfd = true;
+ atomic_set(&ev_fd->refs, 1);
+ atomic_set(&ev_fd->ops, 0);
+ rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
+ return 0;
+}
+
+int io_eventfd_unregister(struct io_ring_ctx *ctx)
+{
+ struct io_ev_fd *ev_fd;
+
+ ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
+ lockdep_is_held(&ctx->uring_lock));
+ if (ev_fd) {
+ ctx->has_evfd = false;
+ rcu_assign_pointer(ctx->io_ev_fd, NULL);
+ if (atomic_dec_and_test(&ev_fd->refs))
+ call_rcu(&ev_fd->rcu, io_eventfd_free);
+ return 0;
+ }
+
+ return -ENXIO;
+}
diff --git a/io_uring/eventfd.h b/io_uring/eventfd.h
new file mode 100644
index 000000000000..d394f49c6321
--- /dev/null
+++ b/io_uring/eventfd.h
@@ -0,0 +1,8 @@
+
+struct io_ring_ctx;
+int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned int eventfd_async);
+int io_eventfd_unregister(struct io_ring_ctx *ctx);
+
+void io_eventfd_flush_signal(struct io_ring_ctx *ctx);
+void io_eventfd_signal(struct io_ring_ctx *ctx);
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index d1c47a9d9215..f1e7c670add8 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -23,6 +23,7 @@
#include "io_uring.h"
#define WORKER_IDLE_TIMEOUT (5 * HZ)
+#define WORKER_INIT_LIMIT 3
enum {
IO_WORKER_F_UP = 0, /* up and active */
@@ -58,6 +59,7 @@ struct io_worker {
unsigned long create_state;
struct callback_head create_work;
+ int init_retries;
union {
struct rcu_head rcu;
@@ -159,7 +161,7 @@ static inline struct io_wq_acct *io_get_acct(struct io_wq *wq, bool bound)
static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq,
struct io_wq_work *work)
{
- return io_get_acct(wq, !(work->flags & IO_WQ_WORK_UNBOUND));
+ return io_get_acct(wq, !(atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND));
}
static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker)
@@ -451,7 +453,7 @@ static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker)
static inline unsigned int io_get_work_hash(struct io_wq_work *work)
{
- return work->flags >> IO_WQ_HASH_SHIFT;
+ return atomic_read(&work->flags) >> IO_WQ_HASH_SHIFT;
}
static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash)
@@ -592,8 +594,9 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
next_hashed = wq_next_work(work);
- if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND))
- work->flags |= IO_WQ_WORK_CANCEL;
+ if (do_kill &&
+ (atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND))
+ atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
wq->do_work(work);
io_assign_current_work(worker, NULL);
@@ -744,7 +747,7 @@ static bool io_wq_work_match_all(struct io_wq_work *work, void *data)
return true;
}
-static inline bool io_should_retry_thread(long err)
+static inline bool io_should_retry_thread(struct io_worker *worker, long err)
{
/*
* Prevent perpetual task_work retry, if the task (or its group) is
@@ -752,6 +755,8 @@ static inline bool io_should_retry_thread(long err)
*/
if (fatal_signal_pending(current))
return false;
+ if (worker->init_retries++ >= WORKER_INIT_LIMIT)
+ return false;
switch (err) {
case -EAGAIN:
@@ -778,7 +783,7 @@ static void create_worker_cont(struct callback_head *cb)
io_init_new_worker(wq, worker, tsk);
io_worker_release(worker);
return;
- } else if (!io_should_retry_thread(PTR_ERR(tsk))) {
+ } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) {
struct io_wq_acct *acct = io_wq_get_acct(worker);
atomic_dec(&acct->nr_running);
@@ -845,7 +850,7 @@ fail:
tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE);
if (!IS_ERR(tsk)) {
io_init_new_worker(wq, worker, tsk);
- } else if (!io_should_retry_thread(PTR_ERR(tsk))) {
+ } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) {
kfree(worker);
goto fail;
} else {
@@ -891,7 +896,7 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data)
static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq)
{
do {
- work->flags |= IO_WQ_WORK_CANCEL;
+ atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
wq->do_work(work);
work = wq->free_work(work);
} while (work);
@@ -926,8 +931,12 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
{
struct io_wq_acct *acct = io_work_get_acct(wq, work);
- unsigned long work_flags = work->flags;
- struct io_cb_cancel_data match;
+ unsigned int work_flags = atomic_read(&work->flags);
+ struct io_cb_cancel_data match = {
+ .fn = io_wq_work_match_item,
+ .data = work,
+ .cancel_all = false,
+ };
bool do_create;
/*
@@ -935,7 +944,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
* been marked as one that should not get executed, cancel it here.
*/
if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
- (work->flags & IO_WQ_WORK_CANCEL)) {
+ (work_flags & IO_WQ_WORK_CANCEL)) {
io_run_cancel(work, wq);
return;
}
@@ -965,10 +974,6 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
raw_spin_unlock(&wq->lock);
/* fatal condition, failed to create the first worker */
- match.fn = io_wq_work_match_item,
- match.data = work,
- match.cancel_all = false,
-
io_acct_cancel_pending_work(wq, acct, &match);
}
}
@@ -982,7 +987,7 @@ void io_wq_hash_work(struct io_wq_work *work, void *val)
unsigned int bit;
bit = hash_ptr(val, IO_WQ_HASH_ORDER);
- work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
+ atomic_or(IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT), &work->flags);
}
static bool __io_wq_worker_cancel(struct io_worker *worker,
@@ -990,7 +995,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker,
struct io_wq_work *work)
{
if (work && match->fn(work, match->data)) {
- work->flags |= IO_WQ_WORK_CANCEL;
+ atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
__set_notify_signal(worker->task);
return true;
}
diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h
index 2b2a6406dd8e..b3b004a7b625 100644
--- a/io_uring/io-wq.h
+++ b/io_uring/io-wq.h
@@ -56,7 +56,7 @@ bool io_wq_worker_stopped(void);
static inline bool io_wq_is_hashed(struct io_wq_work *work)
{
- return work->flags & IO_WQ_WORK_HASHED;
+ return atomic_read(&work->flags) & IO_WQ_WORK_HASHED;
}
typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 816e93e7f949..8e6faa942a6f 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -95,12 +95,14 @@
#include "futex.h"
#include "napi.h"
#include "uring_cmd.h"
+#include "msg_ring.h"
#include "memmap.h"
#include "timeout.h"
#include "poll.h"
#include "rw.h"
#include "alloc_cache.h"
+#include "eventfd.h"
#define IORING_MAX_ENTRIES 32768
#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
@@ -314,6 +316,9 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
sizeof(struct io_async_rw));
ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct uring_cache));
+ spin_lock_init(&ctx->msg_lock);
+ ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX,
+ sizeof(struct io_kiocb));
ret |= io_futex_cache_init(ctx);
if (ret)
goto err;
@@ -350,6 +355,7 @@ err:
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
io_alloc_cache_free(&ctx->uring_cache, kfree);
+ io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
io_futex_cache_free(ctx);
kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs);
@@ -461,9 +467,9 @@ static void io_prep_async_work(struct io_kiocb *req)
}
req->work.list.next = NULL;
- req->work.flags = 0;
+ atomic_set(&req->work.flags, 0);
if (req->flags & REQ_F_FORCE_ASYNC)
- req->work.flags |= IO_WQ_WORK_CONCURRENT;
+ atomic_or(IO_WQ_WORK_CONCURRENT, &req->work.flags);
if (req->file && !(req->flags & REQ_F_FIXED_FILE))
req->flags |= io_file_get_flags(req->file);
@@ -479,7 +485,7 @@ static void io_prep_async_work(struct io_kiocb *req)
io_wq_hash_work(&req->work, file_inode(req->file));
} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
if (def->unbound_nonreg_file)
- req->work.flags |= IO_WQ_WORK_UNBOUND;
+ atomic_or(IO_WQ_WORK_UNBOUND, &req->work.flags);
}
}
@@ -519,7 +525,7 @@ static void io_queue_iowq(struct io_kiocb *req)
* worker for it).
*/
if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
- req->work.flags |= IO_WQ_WORK_CANCEL;
+ atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags);
trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
io_wq_enqueue(tctx->io_wq, &req->work);
@@ -541,84 +547,6 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
}
}
-void io_eventfd_ops(struct rcu_head *rcu)
-{
- struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
- int ops = atomic_xchg(&ev_fd->ops, 0);
-
- if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
- eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
-
- /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
- * ordering in a race but if references are 0 we know we have to free
- * it regardless.
- */
- if (atomic_dec_and_test(&ev_fd->refs)) {
- eventfd_ctx_put(ev_fd->cq_ev_fd);
- kfree(ev_fd);
- }
-}
-
-static void io_eventfd_signal(struct io_ring_ctx *ctx)
-{
- struct io_ev_fd *ev_fd = NULL;
-
- rcu_read_lock();
- /*
- * rcu_dereference ctx->io_ev_fd once and use it for both for checking
- * and eventfd_signal
- */
- ev_fd = rcu_dereference(ctx->io_ev_fd);
-
- /*
- * Check again if ev_fd exists incase an io_eventfd_unregister call
- * completed between the NULL check of ctx->io_ev_fd at the start of
- * the function and rcu_read_lock.
- */
- if (unlikely(!ev_fd))
- goto out;
- if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
- goto out;
- if (ev_fd->eventfd_async && !io_wq_current_is_worker())
- goto out;
-
- if (likely(eventfd_signal_allowed())) {
- eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
- } else {
- atomic_inc(&ev_fd->refs);
- if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
- call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops);
- else
- atomic_dec(&ev_fd->refs);
- }
-
-out:
- rcu_read_unlock();
-}
-
-static void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
-{
- bool skip;
-
- spin_lock(&ctx->completion_lock);
-
- /*
- * Eventfd should only get triggered when at least one event has been
- * posted. Some applications rely on the eventfd notification count
- * only changing IFF a new CQE has been added to the CQ ring. There's
- * no depedency on 1:1 relationship between how many times this
- * function is called (and hence the eventfd count) and number of CQEs
- * posted to the CQ ring.
- */
- skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
- ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
- spin_unlock(&ctx->completion_lock);
- if (skip)
- return;
-
- io_eventfd_signal(ctx);
-}
-
void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
if (ctx->poll_activated)
@@ -878,20 +806,43 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
return false;
}
-bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
+static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res,
+ u32 cflags)
{
bool filled;
- io_cq_lock(ctx);
filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
if (!filled)
filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
+ return filled;
+}
+
+bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
+{
+ bool filled;
+
+ io_cq_lock(ctx);
+ filled = __io_post_aux_cqe(ctx, user_data, res, cflags);
io_cq_unlock_post(ctx);
return filled;
}
/*
+ * Must be called from inline task_work so we now a flush will happen later,
+ * and obviously with ctx->uring_lock held (tw always has that).
+ */
+void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
+{
+ if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) {
+ spin_lock(&ctx->completion_lock);
+ io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
+ spin_unlock(&ctx->completion_lock);
+ }
+ ctx->submit_state.cq_flush = true;
+}
+
+/*
* A helper for multishot requests posting additional CQEs.
* Should only be used from a task_work including IO_URING_F_MULTISHOT.
*/
@@ -1175,9 +1126,10 @@ void tctx_task_work(struct callback_head *cb)
WARN_ON_ONCE(ret);
}
-static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
+static inline void io_req_local_work_add(struct io_kiocb *req,
+ struct io_ring_ctx *ctx,
+ unsigned flags)
{
- struct io_ring_ctx *ctx = req->ctx;
unsigned nr_wait, nr_tw, nr_tw_prev;
struct llist_node *head;
@@ -1191,6 +1143,8 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
flags &= ~IOU_F_TWQ_LAZY_WAKE;
+ guard(rcu)();
+
head = READ_ONCE(ctx->work_llist.first);
do {
nr_tw_prev = 0;
@@ -1259,8 +1213,8 @@ static void io_req_normal_work_add(struct io_kiocb *req)
if (ctx->flags & IORING_SETUP_SQPOLL) {
struct io_sq_data *sqd = ctx->sq_data;
- if (wq_has_sleeper(&sqd->wait))
- wake_up(&sqd->wait);
+ if (sqd->thread)
+ __set_notify_signal(sqd->thread);
return;
}
@@ -1272,13 +1226,18 @@ static void io_req_normal_work_add(struct io_kiocb *req)
void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
{
- if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
- rcu_read_lock();
- io_req_local_work_add(req, flags);
- rcu_read_unlock();
- } else {
+ if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN)
+ io_req_local_work_add(req, req->ctx, flags);
+ else
io_req_normal_work_add(req);
- }
+}
+
+void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx,
+ unsigned flags)
+{
+ if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)))
+ return;
+ io_req_local_work_add(req, ctx, flags);
}
static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
@@ -1467,7 +1426,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
}
__io_cq_unlock_post(ctx);
- if (!wq_list_empty(&ctx->submit_state.compl_reqs)) {
+ if (!wq_list_empty(&state->compl_reqs)) {
io_free_batch_list(ctx, state->compl_reqs.first);
INIT_WQ_LIST(&state->compl_reqs);
}
@@ -1813,14 +1772,14 @@ void io_wq_submit_work(struct io_wq_work *work)
io_arm_ltimeout(req);
/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
- if (work->flags & IO_WQ_WORK_CANCEL) {
+ if (atomic_read(&work->flags) & IO_WQ_WORK_CANCEL) {
fail:
io_req_task_queue_fail(req, err);
return;
}
if (!io_assign_file(req, def, issue_flags)) {
err = -EBADF;
- work->flags |= IO_WQ_WORK_CANCEL;
+ atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
goto fail;
}
@@ -2058,6 +2017,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->file = NULL;
req->rsrc_node = NULL;
req->task = current;
+ req->cancel_seq_set = false;
if (unlikely(opcode >= IORING_OP_LAST)) {
req->opcode = 0;
@@ -2648,6 +2608,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
io_alloc_cache_free(&ctx->uring_cache, kfree);
+ io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
io_futex_cache_free(ctx);
io_destroy_buffers(ctx);
mutex_unlock(&ctx->uring_lock);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 624ca9076a50..e1ce908f0679 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -65,6 +65,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
int io_run_task_work_sig(struct io_ring_ctx *ctx);
void io_req_defer_failed(struct io_kiocb *req, s32 res);
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
+void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags);
void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
@@ -73,6 +74,8 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned issue_flags);
void __io_req_task_work_add(struct io_kiocb *req, unsigned flags);
+void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx,
+ unsigned flags);
bool io_alloc_async_data(struct io_kiocb *req);
void io_req_task_queue(struct io_kiocb *req);
void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts);
@@ -104,12 +107,6 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
bool cancel_all);
-enum {
- IO_EVENTFD_OP_SIGNAL_BIT,
- IO_EVENTFD_OP_FREE_BIT,
-};
-
-void io_eventfd_ops(struct rcu_head *rcu);
void io_activate_pollwq(struct io_ring_ctx *ctx);
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
@@ -433,7 +430,7 @@ static inline bool io_file_can_poll(struct io_kiocb *req)
{
if (req->flags & REQ_F_CAN_POLL)
return true;
- if (file_can_poll(req->file)) {
+ if (req->file && file_can_poll(req->file)) {
req->flags |= REQ_F_CAN_POLL;
return true;
}
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index 4785d6af5fee..a0f32a255fd1 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -244,6 +244,7 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
struct io_ring_ctx *ctx = file->private_data;
size_t sz = vma->vm_end - vma->vm_start;
long offset = vma->vm_pgoff << PAGE_SHIFT;
+ unsigned int npages;
void *ptr;
ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
@@ -253,8 +254,8 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
switch (offset & IORING_OFF_MMAP_MASK) {
case IORING_OFF_SQ_RING:
case IORING_OFF_CQ_RING:
- return io_uring_mmap_pages(ctx, vma, ctx->ring_pages,
- ctx->n_ring_pages);
+ npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT);
+ return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages);
case IORING_OFF_SQES:
return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
ctx->n_sqe_pages);
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 81c4a9d43729..29fa9285a33d 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -11,9 +11,9 @@
#include "io_uring.h"
#include "rsrc.h"
#include "filetable.h"
+#include "alloc_cache.h"
#include "msg_ring.h"
-
/* All valid masks for MSG_RING */
#define IORING_MSG_RING_MASK (IORING_MSG_RING_CQE_SKIP | \
IORING_MSG_RING_FLAGS_PASS)
@@ -68,59 +68,70 @@ void io_msg_ring_cleanup(struct io_kiocb *req)
static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx)
{
- if (!target_ctx->task_complete)
- return false;
- return current != target_ctx->submitter_task;
+ return target_ctx->task_complete;
}
-static int io_msg_exec_remote(struct io_kiocb *req, task_work_func_t func)
+static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts)
{
- struct io_ring_ctx *ctx = req->file->private_data;
- struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
- struct task_struct *task = READ_ONCE(ctx->submitter_task);
+ struct io_ring_ctx *ctx = req->ctx;
- if (unlikely(!task))
- return -EOWNERDEAD;
+ io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags);
+ if (spin_trylock(&ctx->msg_lock)) {
+ if (io_alloc_cache_put(&ctx->msg_cache, req))
+ req = NULL;
+ spin_unlock(&ctx->msg_lock);
+ }
+ if (req)
+ kmem_cache_free(req_cachep, req);
+ percpu_ref_put(&ctx->refs);
+}
- init_task_work(&msg->tw, func);
- if (task_work_add(task, &msg->tw, TWA_SIGNAL))
+static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req,
+ int res, u32 cflags, u64 user_data)
+{
+ req->task = READ_ONCE(ctx->submitter_task);
+ if (!req->task) {
+ kmem_cache_free(req_cachep, req);
return -EOWNERDEAD;
+ }
+ req->cqe.user_data = user_data;
+ io_req_set_res(req, res, cflags);
+ percpu_ref_get(&ctx->refs);
+ req->ctx = ctx;
+ req->io_task_work.func = io_msg_tw_complete;
+ io_req_task_work_add_remote(req, ctx, IOU_F_TWQ_LAZY_WAKE);
+ return 0;
+}
- return IOU_ISSUE_SKIP_COMPLETE;
+static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx)
+{
+ struct io_kiocb *req = NULL;
+
+ if (spin_trylock(&ctx->msg_lock)) {
+ req = io_alloc_cache_get(&ctx->msg_cache);
+ spin_unlock(&ctx->msg_lock);
+ }
+ if (req)
+ return req;
+ return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN);
}
-static void io_msg_tw_complete(struct callback_head *head)
+static int io_msg_data_remote(struct io_kiocb *req)
{
- struct io_msg *msg = container_of(head, struct io_msg, tw);
- struct io_kiocb *req = cmd_to_io_kiocb(msg);
struct io_ring_ctx *target_ctx = req->file->private_data;
- int ret = 0;
-
- if (current->flags & PF_EXITING) {
- ret = -EOWNERDEAD;
- } else {
- u32 flags = 0;
-
- if (msg->flags & IORING_MSG_RING_FLAGS_PASS)
- flags = msg->cqe_flags;
-
- /*
- * If the target ring is using IOPOLL mode, then we need to be
- * holding the uring_lock for posting completions. Other ring
- * types rely on the regular completion locking, which is
- * handled while posting.
- */
- if (target_ctx->flags & IORING_SETUP_IOPOLL)
- mutex_lock(&target_ctx->uring_lock);
- if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags))
- ret = -EOVERFLOW;
- if (target_ctx->flags & IORING_SETUP_IOPOLL)
- mutex_unlock(&target_ctx->uring_lock);
- }
+ struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
+ struct io_kiocb *target;
+ u32 flags = 0;
- if (ret < 0)
- req_set_fail(req);
- io_req_queue_tw_complete(req, ret);
+ target = io_msg_get_kiocb(req->ctx);
+ if (unlikely(!target))
+ return -ENOMEM;
+
+ if (msg->flags & IORING_MSG_RING_FLAGS_PASS)
+ flags = msg->cqe_flags;
+
+ return io_msg_remote_post(target_ctx, target, msg->len, flags,
+ msg->user_data);
}
static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
@@ -138,7 +149,7 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
return -EBADFD;
if (io_msg_need_remote(target_ctx))
- return io_msg_exec_remote(req, io_msg_tw_complete);
+ return io_msg_data_remote(req);
if (msg->flags & IORING_MSG_RING_FLAGS_PASS)
flags = msg->cqe_flags;
@@ -218,6 +229,22 @@ static void io_msg_tw_fd_complete(struct callback_head *head)
io_req_queue_tw_complete(req, ret);
}
+static int io_msg_fd_remote(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->file->private_data;
+ struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
+ struct task_struct *task = READ_ONCE(ctx->submitter_task);
+
+ if (unlikely(!task))
+ return -EOWNERDEAD;
+
+ init_task_work(&msg->tw, io_msg_tw_fd_complete);
+ if (task_work_add(task, &msg->tw, TWA_SIGNAL))
+ return -EOWNERDEAD;
+
+ return IOU_ISSUE_SKIP_COMPLETE;
+}
+
static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *target_ctx = req->file->private_data;
@@ -240,7 +267,7 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
}
if (io_msg_need_remote(target_ctx))
- return io_msg_exec_remote(req, io_msg_tw_fd_complete);
+ return io_msg_fd_remote(req);
return io_msg_install_complete(req, issue_flags);
}
@@ -294,3 +321,10 @@ done:
io_req_set_res(req, ret, 0);
return IOU_OK;
}
+
+void io_msg_cache_free(const void *entry)
+{
+ struct io_kiocb *req = (struct io_kiocb *) entry;
+
+ kmem_cache_free(req_cachep, req);
+}
diff --git a/io_uring/msg_ring.h b/io_uring/msg_ring.h
index 3987ee6c0e5f..3030f3942f0f 100644
--- a/io_uring/msg_ring.h
+++ b/io_uring/msg_ring.h
@@ -3,3 +3,4 @@
int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags);
void io_msg_ring_cleanup(struct io_kiocb *req);
+void io_msg_cache_free(const void *entry);
diff --git a/io_uring/napi.c b/io_uring/napi.c
index 883a1a665907..762254a7ff3f 100644
--- a/io_uring/napi.c
+++ b/io_uring/napi.c
@@ -261,12 +261,14 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
}
/*
- * __io_napi_adjust_timeout() - Add napi id to the busy poll list
+ * __io_napi_adjust_timeout() - adjust busy loop timeout
* @ctx: pointer to io-uring context structure
* @iowq: pointer to io wait queue
* @ts: pointer to timespec or NULL
*
* Adjust the busy loop timeout according to timespec and busy poll timeout.
+ * If the specified NAPI timeout is bigger than the wait timeout, then adjust
+ * the NAPI timeout accordingly.
*/
void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
struct timespec64 *ts)
@@ -274,16 +276,16 @@ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iow
unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to);
if (ts) {
- struct timespec64 poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to);
-
- if (timespec64_compare(ts, &poll_to_ts) > 0) {
- *ts = timespec64_sub(*ts, poll_to_ts);
- } else {
- u64 to = timespec64_to_ns(ts);
-
- do_div(to, 1000);
- ts->tv_sec = 0;
- ts->tv_nsec = 0;
+ struct timespec64 poll_to_ts;
+
+ poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to);
+ if (timespec64_compare(ts, &poll_to_ts) < 0) {
+ s64 poll_to_ns = timespec64_to_ns(ts);
+ if (poll_to_ns > 0) {
+ u64 val = poll_to_ns + 999;
+ do_div(val, 1000);
+ poll_to = val;
+ }
}
}
diff --git a/io_uring/net.c b/io_uring/net.c
index 0a48596429d9..7b75da2e7826 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -51,6 +51,16 @@ struct io_connect {
bool seen_econnaborted;
};
+struct io_bind {
+ struct file *file;
+ int addr_len;
+};
+
+struct io_listen {
+ struct file *file;
+ int backlog;
+};
+
struct io_sr_msg {
struct file *file;
union {
@@ -817,20 +827,20 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
bool mshot_finished, unsigned issue_flags)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
- unsigned int cflags;
-
- if (sr->flags & IORING_RECVSEND_BUNDLE)
- cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret),
- issue_flags);
- else
- cflags = io_put_kbuf(req, issue_flags);
+ unsigned int cflags = 0;
if (kmsg->msg.msg_inq > 0)
cflags |= IORING_CQE_F_SOCK_NONEMPTY;
- /* bundle with no more immediate buffers, we're done */
- if (sr->flags & IORING_RECVSEND_BUNDLE && req->flags & REQ_F_BL_EMPTY)
- goto finish;
+ if (sr->flags & IORING_RECVSEND_BUNDLE) {
+ cflags |= io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret),
+ issue_flags);
+ /* bundle with no more immediate buffers, we're done */
+ if (req->flags & REQ_F_BL_EMPTY)
+ goto finish;
+ } else {
+ cflags |= io_put_kbuf(req, issue_flags);
+ }
/*
* Fill CQE for this receive and see if we should keep trying to
@@ -1129,13 +1139,15 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
retry_multishot:
if (io_do_buffer_select(req)) {
ret = io_recv_buf_select(req, kmsg, &len, issue_flags);
- if (unlikely(ret))
+ if (unlikely(ret)) {
+ kmsg->msg.msg_inq = -1;
goto out_free;
+ }
sr->buf = NULL;
}
- kmsg->msg.msg_inq = -1;
kmsg->msg.msg_flags = 0;
+ kmsg->msg.msg_inq = -1;
if (flags & MSG_WAITALL)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
@@ -1715,6 +1727,70 @@ out:
return IOU_OK;
}
+int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
+ struct sockaddr __user *uaddr;
+ struct io_async_msghdr *io;
+
+ if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
+ return -EINVAL;
+
+ uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ bind->addr_len = READ_ONCE(sqe->addr2);
+
+ io = io_msg_alloc_async(req);
+ if (unlikely(!io))
+ return -ENOMEM;
+ return move_addr_to_kernel(uaddr, bind->addr_len, &io->addr);
+}
+
+int io_bind(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
+ struct io_async_msghdr *io = req->async_data;
+ struct socket *sock;
+ int ret;
+
+ sock = sock_from_file(req->file);
+ if (unlikely(!sock))
+ return -ENOTSOCK;
+
+ ret = __sys_bind_socket(sock, &io->addr, bind->addr_len);
+ if (ret < 0)
+ req_set_fail(req);
+ io_req_set_res(req, ret, 0);
+ return 0;
+}
+
+int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
+
+ if (sqe->addr || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in || sqe->addr2)
+ return -EINVAL;
+
+ listen->backlog = READ_ONCE(sqe->len);
+ return 0;
+}
+
+int io_listen(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
+ struct socket *sock;
+ int ret;
+
+ sock = sock_from_file(req->file);
+ if (unlikely(!sock))
+ return -ENOTSOCK;
+
+ ret = __sys_listen_socket(sock, listen->backlog);
+ if (ret < 0)
+ req_set_fail(req);
+ io_req_set_res(req, ret, 0);
+ return 0;
+}
+
void io_netmsg_cache_free(const void *entry)
{
struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;
diff --git a/io_uring/net.h b/io_uring/net.h
index 0eb1c1920fc9..52bfee05f06a 100644
--- a/io_uring/net.h
+++ b/io_uring/net.h
@@ -49,6 +49,12 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags);
int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
void io_send_zc_cleanup(struct io_kiocb *req);
+int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_bind(struct io_kiocb *req, unsigned int issue_flags);
+
+int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_listen(struct io_kiocb *req, unsigned int issue_flags);
+
void io_netmsg_cache_free(const void *entry);
#else
static inline void io_netmsg_cache_free(const void *entry)
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 2de5cca9504e..a2be3bbca5ff 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -495,6 +495,26 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_ftruncate_prep,
.issue = io_ftruncate,
},
+ [IORING_OP_BIND] = {
+#if defined(CONFIG_NET)
+ .needs_file = 1,
+ .prep = io_bind_prep,
+ .issue = io_bind,
+ .async_size = sizeof(struct io_async_msghdr),
+#else
+ .prep = io_eopnotsupp_prep,
+#endif
+ },
+ [IORING_OP_LISTEN] = {
+#if defined(CONFIG_NET)
+ .needs_file = 1,
+ .prep = io_listen_prep,
+ .issue = io_listen,
+ .async_size = sizeof(struct io_async_msghdr),
+#else
+ .prep = io_eopnotsupp_prep,
+#endif
+ },
};
const struct io_cold_def io_cold_defs[] = {
@@ -516,10 +536,12 @@ const struct io_cold_def io_cold_defs[] = {
},
[IORING_OP_READ_FIXED] = {
.name = "READ_FIXED",
+ .cleanup = io_readv_writev_cleanup,
.fail = io_rw_fail,
},
[IORING_OP_WRITE_FIXED] = {
.name = "WRITE_FIXED",
+ .cleanup = io_readv_writev_cleanup,
.fail = io_rw_fail,
},
[IORING_OP_POLL_ADD] = {
@@ -582,10 +604,12 @@ const struct io_cold_def io_cold_defs[] = {
},
[IORING_OP_READ] = {
.name = "READ",
+ .cleanup = io_readv_writev_cleanup,
.fail = io_rw_fail,
},
[IORING_OP_WRITE] = {
.name = "WRITE",
+ .cleanup = io_readv_writev_cleanup,
.fail = io_rw_fail,
},
[IORING_OP_FADVISE] = {
@@ -692,6 +716,7 @@ const struct io_cold_def io_cold_defs[] = {
},
[IORING_OP_READ_MULTISHOT] = {
.name = "READ_MULTISHOT",
+ .cleanup = io_readv_writev_cleanup,
},
[IORING_OP_WAITID] = {
.name = "WAITID",
@@ -711,6 +736,12 @@ const struct io_cold_def io_cold_defs[] = {
[IORING_OP_FTRUNCATE] = {
.name = "FTRUNCATE",
},
+ [IORING_OP_BIND] = {
+ .name = "BIND",
+ },
+ [IORING_OP_LISTEN] = {
+ .name = "LISTEN",
+ },
};
const char *io_uring_get_opcode(u8 opcode)
@@ -720,6 +751,14 @@ const char *io_uring_get_opcode(u8 opcode)
return "INVALID";
}
+bool io_uring_op_supported(u8 opcode)
+{
+ if (opcode < IORING_OP_LAST &&
+ io_issue_defs[opcode].prep != io_eopnotsupp_prep)
+ return true;
+ return false;
+}
+
void __init io_uring_optable_init(void)
{
int i;
diff --git a/io_uring/opdef.h b/io_uring/opdef.h
index 7ee6f5aa90aa..14456436ff74 100644
--- a/io_uring/opdef.h
+++ b/io_uring/opdef.h
@@ -17,8 +17,6 @@ struct io_issue_def {
unsigned poll_exclusive : 1;
/* op supports buffer selection */
unsigned buffer_select : 1;
- /* opcode is not supported by this kernel */
- unsigned not_supported : 1;
/* skip auditing */
unsigned audit_skip : 1;
/* supports ioprio */
@@ -47,5 +45,7 @@ struct io_cold_def {
extern const struct io_issue_def io_issue_defs[];
extern const struct io_cold_def io_cold_defs[];
+bool io_uring_op_supported(u8 opcode);
+
void io_uring_optable_init(void);
#endif
diff --git a/io_uring/register.c b/io_uring/register.c
index ef8c908346a4..e3c20be5a198 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -27,65 +27,11 @@
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
+#include "eventfd.h"
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
-static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
- unsigned int eventfd_async)
-{
- struct io_ev_fd *ev_fd;
- __s32 __user *fds = arg;
- int fd;
-
- ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
- lockdep_is_held(&ctx->uring_lock));
- if (ev_fd)
- return -EBUSY;
-
- if (copy_from_user(&fd, fds, sizeof(*fds)))
- return -EFAULT;
-
- ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
- if (!ev_fd)
- return -ENOMEM;
-
- ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
- if (IS_ERR(ev_fd->cq_ev_fd)) {
- int ret = PTR_ERR(ev_fd->cq_ev_fd);
- kfree(ev_fd);
- return ret;
- }
-
- spin_lock(&ctx->completion_lock);
- ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
- spin_unlock(&ctx->completion_lock);
-
- ev_fd->eventfd_async = eventfd_async;
- ctx->has_evfd = true;
- rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
- atomic_set(&ev_fd->refs, 1);
- atomic_set(&ev_fd->ops, 0);
- return 0;
-}
-
-int io_eventfd_unregister(struct io_ring_ctx *ctx)
-{
- struct io_ev_fd *ev_fd;
-
- ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
- lockdep_is_held(&ctx->uring_lock));
- if (ev_fd) {
- ctx->has_evfd = false;
- rcu_assign_pointer(ctx->io_ev_fd, NULL);
- if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
- call_rcu(&ev_fd->rcu, io_eventfd_ops);
- return 0;
- }
-
- return -ENXIO;
-}
-
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
@@ -93,9 +39,10 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
size_t size;
int i, ret;
+ if (nr_args > IORING_OP_LAST)
+ nr_args = IORING_OP_LAST;
+
size = struct_size(p, ops, nr_args);
- if (size == SIZE_MAX)
- return -EOVERFLOW;
p = kzalloc(size, GFP_KERNEL);
if (!p)
return -ENOMEM;
@@ -108,12 +55,10 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
goto out;
p->last_op = IORING_OP_LAST - 1;
- if (nr_args > IORING_OP_LAST)
- nr_args = IORING_OP_LAST;
for (i = 0; i < nr_args; i++) {
p->ops[i].op = i;
- if (!io_issue_defs[i].not_supported)
+ if (io_uring_op_supported(i))
p->ops[i].flags = IO_URING_OP_SUPPORTED;
}
p->ops_len = i;
@@ -355,8 +300,10 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
}
if (sqd) {
+ mutex_unlock(&ctx->uring_lock);
mutex_unlock(&sqd->lock);
io_put_sq_data(sqd);
+ mutex_lock(&ctx->uring_lock);
}
if (copy_to_user(arg, new_count, sizeof(new_count)))
@@ -380,8 +327,10 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
return 0;
err:
if (sqd) {
+ mutex_unlock(&ctx->uring_lock);
mutex_unlock(&sqd->lock);
io_put_sq_data(sqd);
+ mutex_lock(&ctx->uring_lock);
}
return ret;
}
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 65417c9553b1..a860516bf448 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -85,31 +85,6 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
return 0;
}
-static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
- void __user *arg, unsigned index)
-{
- struct iovec __user *src;
-
-#ifdef CONFIG_COMPAT
- if (ctx->compat) {
- struct compat_iovec __user *ciovs;
- struct compat_iovec ciov;
-
- ciovs = (struct compat_iovec __user *) arg;
- if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
- return -EFAULT;
-
- dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
- dst->iov_len = ciov.iov_len;
- return 0;
- }
-#endif
- src = (struct iovec __user *) arg;
- if (copy_from_user(dst, &src[index], sizeof(*dst)))
- return -EFAULT;
- return 0;
-}
-
static int io_buffer_validate(struct iovec *iov)
{
unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
@@ -249,6 +224,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
ret = io_run_task_work_sig(ctx);
if (ret < 0) {
+ finish_wait(&ctx->rsrc_quiesce_wq, &we);
mutex_lock(&ctx->uring_lock);
if (list_empty(&ctx->rsrc_ref_list))
ret = 0;
@@ -256,7 +232,6 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
}
schedule();
- __set_current_state(TASK_RUNNING);
mutex_lock(&ctx->uring_lock);
ret = 0;
} while (!list_empty(&ctx->rsrc_ref_list));
@@ -419,8 +394,9 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
struct io_uring_rsrc_update2 *up,
unsigned int nr_args)
{
+ struct iovec __user *uvec = u64_to_user_ptr(up->data);
u64 __user *tags = u64_to_user_ptr(up->tags);
- struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
+ struct iovec fast_iov, *iov;
struct page *last_hpage = NULL;
__u32 done;
int i, err;
@@ -434,21 +410,23 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
struct io_mapped_ubuf *imu;
u64 tag = 0;
- err = io_copy_iov(ctx, &iov, iovs, done);
- if (err)
+ iov = iovec_from_user(&uvec[done], 1, 1, &fast_iov, ctx->compat);
+ if (IS_ERR(iov)) {
+ err = PTR_ERR(iov);
break;
+ }
if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
err = -EFAULT;
break;
}
- err = io_buffer_validate(&iov);
+ err = io_buffer_validate(iov);
if (err)
break;
- if (!iov.iov_base && tag) {
+ if (!iov->iov_base && tag) {
err = -EINVAL;
break;
}
- err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
+ err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage);
if (err)
break;
@@ -970,8 +948,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
{
struct page *last_hpage = NULL;
struct io_rsrc_data *data;
+ struct iovec fast_iov, *iov = &fast_iov;
+ const struct iovec __user *uvec = (struct iovec * __user) arg;
int i, ret;
- struct iovec iov;
BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
@@ -988,24 +967,27 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
+ if (!arg)
+ memset(iov, 0, sizeof(*iov));
+
for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
if (arg) {
- ret = io_copy_iov(ctx, &iov, arg, i);
- if (ret)
+ iov = iovec_from_user(&uvec[i], 1, 1, &fast_iov, ctx->compat);
+ if (IS_ERR(iov)) {
+ ret = PTR_ERR(iov);
break;
- ret = io_buffer_validate(&iov);
+ }
+ ret = io_buffer_validate(iov);
if (ret)
break;
- } else {
- memset(&iov, 0, sizeof(iov));
}
- if (!iov.iov_base && *io_get_tag_slot(data, i)) {
+ if (!iov->iov_base && *io_get_tag_slot(data, i)) {
ret = -EINVAL;
break;
}
- ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
+ ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i],
&last_hpage);
if (ret)
break;
@@ -1067,7 +1049,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
* branch doesn't expect non PAGE_SIZE'd chunks.
*/
iter->bvec = bvec;
- iter->nr_segs = bvec->bv_len;
iter->count -= offset;
iter->iov_offset = offset;
} else {
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 1a2128459cb4..c004d21e2f12 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -772,7 +772,7 @@ static bool need_complete_io(struct io_kiocb *req)
S_ISBLK(file_inode(req->file)->i_mode);
}
-static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
+static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
struct kiocb *kiocb = &rw->kiocb;
@@ -787,7 +787,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
req->flags |= io_file_get_flags(file);
kiocb->ki_flags = file->f_iocb_flags;
- ret = kiocb_set_rw_flags(kiocb, rw->flags);
+ ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type);
if (unlikely(ret))
return ret;
kiocb->ki_flags |= IOCB_ALLOC_CACHE;
@@ -832,8 +832,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(ret < 0))
return ret;
}
-
- ret = io_rw_init_file(req, FMODE_READ);
+ ret = io_rw_init_file(req, FMODE_READ, READ);
if (unlikely(ret))
return ret;
req->cqe.res = iov_iter_count(&io->iter);
@@ -1013,7 +1012,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
ssize_t ret, ret2;
loff_t *ppos;
- ret = io_rw_init_file(req, FMODE_WRITE);
+ ret = io_rw_init_file(req, FMODE_WRITE, WRITE);
if (unlikely(ret))
return ret;
req->cqe.res = iov_iter_count(&io->iter);
diff --git a/io_uring/statx.c b/io_uring/statx.c
index abb874209caa..f7f9b202eec0 100644
--- a/io_uring/statx.c
+++ b/io_uring/statx.c
@@ -37,8 +37,7 @@ int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sx->flags = READ_ONCE(sqe->statx_flags);
sx->filename = getname_flags(path,
- getname_statx_lookup_flags(sx->flags),
- NULL);
+ getname_statx_lookup_flags(sx->flags));
if (IS_ERR(sx->filename)) {
int ret = PTR_ERR(sx->filename);
diff --git a/io_uring/xattr.c b/io_uring/xattr.c
index 44905b82eea8..6cf41c3bc369 100644
--- a/io_uring/xattr.c
+++ b/io_uring/xattr.c
@@ -96,7 +96,7 @@ int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
- ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
+ ix->filename = getname_flags(path, LOOKUP_FOLLOW);
if (IS_ERR(ix->filename)) {
ret = PTR_ERR(ix->filename);
ix->filename = NULL;
@@ -189,7 +189,7 @@ int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
- ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
+ ix->filename = getname_flags(path, LOOKUP_FOLLOW);
if (IS_ERR(ix->filename)) {
ret = PTR_ERR(ix->filename);
ix->filename = NULL;