diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-01-11 14:19:23 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-01-11 14:19:23 -0800 |
commit | 4c72e2b8c42e57f65d8fbfb01329e79d2b450653 (patch) | |
tree | 93dda9fd8adf1c1948c8dd83809aff4f207c7ea5 /io_uring | |
parent | 01d550f0fcc06c7292f79a6f1453aac122d1d2c8 (diff) | |
parent | 6ff1407e24e6fdfa4a16ba9ba551e3d253a26391 (diff) |
Merge tag 'for-6.8/io_uring-2024-01-08' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe:
"Mostly just come fixes and cleanups, but one feature as well. In
detail:
- Harden the check for handling IOPOLL based on return (Pavel)
- Various minor optimizations (Pavel)
- Drop remnants of SCM_RIGHTS fd passing support, now that it's no
longer supported since 6.7 (me)
- Fix for a case where bytes_done wasn't initialized properly on a
failure condition for read/write requests (me)
- Move the register related code to a separate file (me)
- Add support for returning the provided ring buffer head (me)
- Add support for adding a direct descriptor to the normal file table
(me, Christian Brauner)
- Fix for ensuring pending task_work for a ring with DEFER_TASKRUN is
run even if we timeout waiting (me)"
* tag 'for-6.8/io_uring-2024-01-08' of git://git.kernel.dk/linux:
io_uring: ensure local task_work is run on wait timeout
io_uring/kbuf: add method for returning provided buffer ring head
io_uring/rw: ensure io->bytes_done is always initialized
io_uring: drop any code related to SCM_RIGHTS
io_uring/unix: drop usage of io_uring socket
io_uring/register: move io_uring_register(2) related code to register.c
io_uring/openclose: add support for IORING_OP_FIXED_FD_INSTALL
io_uring/cmd: inline io_uring_cmd_get_task
io_uring/cmd: inline io_uring_cmd_do_in_task_lazy
io_uring: split out cmd api into a separate header
io_uring: optimise ltimeout for inline execution
io_uring: don't check iopoll if request completes
Diffstat (limited to 'io_uring')
-rw-r--r-- | io_uring/Makefile | 2 | ||||
-rw-r--r-- | io_uring/filetable.c | 11 | ||||
-rw-r--r-- | io_uring/io_uring.c | 663 | ||||
-rw-r--r-- | io_uring/io_uring.h | 19 | ||||
-rw-r--r-- | io_uring/kbuf.c | 26 | ||||
-rw-r--r-- | io_uring/kbuf.h | 1 | ||||
-rw-r--r-- | io_uring/opdef.c | 9 | ||||
-rw-r--r-- | io_uring/openclose.c | 44 | ||||
-rw-r--r-- | io_uring/openclose.h | 3 | ||||
-rw-r--r-- | io_uring/register.c | 605 | ||||
-rw-r--r-- | io_uring/register.h | 8 | ||||
-rw-r--r-- | io_uring/rsrc.c | 169 | ||||
-rw-r--r-- | io_uring/rsrc.h | 15 | ||||
-rw-r--r-- | io_uring/rw.c | 12 | ||||
-rw-r--r-- | io_uring/uring_cmd.c | 15 |
15 files changed, 752 insertions, 850 deletions
diff --git a/io_uring/Makefile b/io_uring/Makefile index e5be47e4fc3b..2cdc51825405 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -8,6 +8,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ statx.o net.o msg_ring.o timeout.o \ sqpoll.o fdinfo.o tctx.o poll.o \ cancel.o kbuf.o rsrc.o rw.o opdef.o \ - notif.o waitid.o + notif.o waitid.o register.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o diff --git a/io_uring/filetable.c b/io_uring/filetable.c index e7d749991de4..6e86e6188dbe 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -87,13 +87,10 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, io_file_bitmap_clear(&ctx->file_table, slot_index); } - ret = io_scm_file_account(ctx, file); - if (!ret) { - *io_get_tag_slot(ctx->file_data, slot_index) = 0; - io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, slot_index); - } - return ret; + *io_get_tag_slot(ctx->file_data, slot_index) = 0; + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, slot_index); + return 0; } int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9839016f82ea..09b6d860deba 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -60,7 +60,6 @@ #include <linux/net.h> #include <net/sock.h> #include <net/af_unix.h> -#include <net/scm.h> #include <linux/anon_inodes.h> #include <linux/sched/mm.h> #include <linux/uaccess.h> @@ -70,6 +69,7 @@ #include <linux/fadvise.h> #include <linux/task_work.h> #include <linux/io_uring.h> +#include <linux/io_uring/cmd.h> #include <linux/audit.h> #include <linux/security.h> #include <asm/shmparam.h> @@ -85,6 +85,7 @@ #include "opdef.h" #include "refs.h" #include "tctx.h" +#include "register.h" #include "sqpoll.h" #include "fdinfo.h" #include "kbuf.h" @@ -103,9 +104,6 @@ #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) -#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ - IORING_REGISTER_LAST + IORING_OP_LAST) - #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) @@ -129,11 +127,6 @@ enum { IO_CHECK_CQ_DROPPED_BIT, }; -enum { - IO_EVENTFD_OP_SIGNAL_BIT, - IO_EVENTFD_OP_FREE_BIT, -}; - struct io_defer_entry { struct list_head list; struct io_kiocb *req; @@ -177,19 +170,6 @@ static struct ctl_table kernel_io_uring_disabled_table[] = { }; #endif -struct sock *io_uring_get_socket(struct file *file) -{ -#if defined(CONFIG_UNIX) - if (io_is_uring_fops(file)) { - struct io_ring_ctx *ctx = file->private_data; - - return ctx->ring_sock->sk; - } -#endif - return NULL; -} -EXPORT_SYMBOL(io_uring_get_socket); - static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) { if (!wq_list_empty(&ctx->submit_state.compl_reqs) || @@ -554,8 +534,7 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) } } - -static void io_eventfd_ops(struct rcu_head *rcu) +void io_eventfd_ops(struct rcu_head *rcu) { struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); int ops = atomic_xchg(&ev_fd->ops, 0); @@ -1898,14 +1877,19 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) io_req_complete_defer(req); else io_req_complete_post(req, issue_flags); - } else if (ret != IOU_ISSUE_SKIP_COMPLETE) - return ret; - /* If the op doesn't have a file, we're not polling for it */ - if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) - io_iopoll_req_issued(req, issue_flags); + return 0; + } - return 0; + if (ret == IOU_ISSUE_SKIP_COMPLETE) { + ret = 0; + io_arm_ltimeout(req); + + /* If the op doesn't have a file, we're not polling for it */ + if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) + io_iopoll_req_issued(req, issue_flags); + } + return ret; } int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) @@ -2076,9 +2060,7 @@ static inline void io_queue_sqe(struct io_kiocb *req) * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ - if (likely(!ret)) - io_arm_ltimeout(req); - else + if (unlikely(ret)) io_queue_async(req, ret); } @@ -2633,8 +2615,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, __set_current_state(TASK_RUNNING); atomic_set(&ctx->cq_wait_nr, 0); - if (ret < 0) - break; /* * Run task_work after scheduling and before io_should_wake(). * If we got woken because of task_work being processed, run it @@ -2644,6 +2624,18 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, if (!llist_empty(&ctx->work_llist)) io_run_local_work(ctx); + /* + * Non-local task_work will be run on exit to userspace, but + * if we're using DEFER_TASKRUN, then we could have waited + * with a timeout for a number of requests. If the timeout + * hits, we could have some requests ready to process. Ensure + * this break is _after_ we have run task_work, to avoid + * deferring running potentially pending requests until the + * next time we wait for events. + */ + if (ret < 0) + break; + check_cq = READ_ONCE(ctx->check_cq); if (unlikely(check_cq)) { /* let the caller flush overflows, retry */ @@ -2831,61 +2823,6 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries return off; } -static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned int eventfd_async) -{ - struct io_ev_fd *ev_fd; - __s32 __user *fds = arg; - int fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) - return -EBUSY; - - if (copy_from_user(&fd, fds, sizeof(*fds))) - return -EFAULT; - - ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); - if (!ev_fd) - return -ENOMEM; - - ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); - if (IS_ERR(ev_fd->cq_ev_fd)) { - int ret = PTR_ERR(ev_fd->cq_ev_fd); - kfree(ev_fd); - return ret; - } - - spin_lock(&ctx->completion_lock); - ctx->evfd_last_cq_tail = ctx->cached_cq_tail; - spin_unlock(&ctx->completion_lock); - - ev_fd->eventfd_async = eventfd_async; - ctx->has_evfd = true; - rcu_assign_pointer(ctx->io_ev_fd, ev_fd); - atomic_set(&ev_fd->refs, 1); - atomic_set(&ev_fd->ops, 0); - return 0; -} - -static int io_eventfd_unregister(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) { - ctx->has_evfd = false; - rcu_assign_pointer(ctx->io_ev_fd, NULL); - if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops)) - call_rcu(&ev_fd->rcu, io_eventfd_ops); - return 0; - } - - return -ENXIO; -} - static void io_req_caches_free(struct io_ring_ctx *ctx) { struct io_kiocb *req; @@ -2938,13 +2875,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_rsrc_node_destroy(ctx, ctx->rsrc_node); WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); - -#if defined(CONFIG_UNIX) - if (ctx->ring_sock) { - ctx->ring_sock->file = NULL; /* so that iput() is called */ - sock_release(ctx->ring_sock); - } -#endif WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free); @@ -2984,7 +2914,7 @@ static __cold void io_activate_pollwq_cb(struct callback_head *cb) percpu_ref_put(&ctx->refs); } -static __cold void io_activate_pollwq(struct io_ring_ctx *ctx) +__cold void io_activate_pollwq(struct io_ring_ctx *ctx) { spin_lock(&ctx->completion_lock); /* already activated or in progress */ @@ -3043,19 +2973,6 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) return mask; } -static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) -{ - const struct cred *creds; - - creds = xa_erase(&ctx->personalities, id); - if (creds) { - put_cred(creds); - return 0; - } - - return -EINVAL; -} - struct io_tctx_exit { struct callback_head task_work; struct completion completion; @@ -3866,32 +3783,12 @@ static int io_uring_install_fd(struct file *file) /* * Allocate an anonymous fd, this is what constitutes the application * visible backing of an io_uring instance. The application mmaps this - * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled, - * we have to tie this fd to a socket for file garbage collection purposes. + * fd to gain access to the SQ/CQ ring details. */ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) { - struct file *file; -#if defined(CONFIG_UNIX) - int ret; - - ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP, - &ctx->ring_sock); - if (ret) - return ERR_PTR(ret); -#endif - - file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx, + return anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx, O_RDWR | O_CLOEXEC, NULL); -#if defined(CONFIG_UNIX) - if (IS_ERR(file)) { - sock_release(ctx->ring_sock); - ctx->ring_sock = NULL; - } else { - ctx->ring_sock->file = file; - } -#endif - return file; } static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, @@ -4158,506 +4055,6 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, return io_uring_setup(entries, params); } -static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) -{ - struct io_uring_probe *p; - size_t size; - int i, ret; - - size = struct_size(p, ops, nr_args); - if (size == SIZE_MAX) - return -EOVERFLOW; - p = kzalloc(size, GFP_KERNEL); - if (!p) - return -ENOMEM; - - ret = -EFAULT; - if (copy_from_user(p, arg, size)) - goto out; - ret = -EINVAL; - if (memchr_inv(p, 0, size)) - goto out; - - p->last_op = IORING_OP_LAST - 1; - if (nr_args > IORING_OP_LAST) - nr_args = IORING_OP_LAST; - - for (i = 0; i < nr_args; i++) { - p->ops[i].op = i; - if (!io_issue_defs[i].not_supported) - p->ops[i].flags = IO_URING_OP_SUPPORTED; - } - p->ops_len = i; - - ret = 0; - if (copy_to_user(arg, p, size)) - ret = -EFAULT; -out: - kfree(p); - return ret; -} - -static int io_register_personality(struct io_ring_ctx *ctx) -{ - const struct cred *creds; - u32 id; - int ret; - - creds = get_current_cred(); - - ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds, - XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL); - if (ret < 0) { - put_cred(creds); - return ret; - } - return id; -} - -static __cold int io_register_restrictions(struct io_ring_ctx *ctx, - void __user *arg, unsigned int nr_args) -{ - struct io_uring_restriction *res; - size_t size; - int i, ret; - - /* Restrictions allowed only if rings started disabled */ - if (!(ctx->flags & IORING_SETUP_R_DISABLED)) - return -EBADFD; - - /* We allow only a single restrictions registration */ - if (ctx->restrictions.registered) - return -EBUSY; - - if (!arg || nr_args > IORING_MAX_RESTRICTIONS) - return -EINVAL; - - size = array_size(nr_args, sizeof(*res)); - if (size == SIZE_MAX) - return -EOVERFLOW; - - res = memdup_user(arg, size); - if (IS_ERR(res)) - return PTR_ERR(res); - - ret = 0; - - for (i = 0; i < nr_args; i++) { - switch (res[i].opcode) { - case IORING_RESTRICTION_REGISTER_OP: - if (res[i].register_op >= IORING_REGISTER_LAST) { - ret = -EINVAL; - goto out; - } - - __set_bit(res[i].register_op, - ctx->restrictions.register_op); - break; - case IORING_RESTRICTION_SQE_OP: - if (res[i].sqe_op >= IORING_OP_LAST) { - ret = -EINVAL; - goto out; - } - - __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op); - break; - case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: - ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; - break; - case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: - ctx->restrictions.sqe_flags_required = res[i].sqe_flags; - break; - default: - ret = -EINVAL; - goto out; - } - } - -out: - /* Reset all restrictions if an error happened */ - if (ret != 0) - memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); - else - ctx->restrictions.registered = true; - - kfree(res); - return ret; -} - -static int io_register_enable_rings(struct io_ring_ctx *ctx) -{ - if (!(ctx->flags & IORING_SETUP_R_DISABLED)) - return -EBADFD; - - if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) { - WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); - /* - * Lazy activation attempts would fail if it was polled before - * submitter_task is set. - */ - if (wq_has_sleeper(&ctx->poll_wq)) - io_activate_pollwq(ctx); - } - - if (ctx->restrictions.registered) - ctx->restricted = 1; - - ctx->flags &= ~IORING_SETUP_R_DISABLED; - if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) - wake_up(&ctx->sq_data->wait); - return 0; -} - -static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx, - cpumask_var_t new_mask) -{ - int ret; - - if (!(ctx->flags & IORING_SETUP_SQPOLL)) { - ret = io_wq_cpu_affinity(current->io_uring, new_mask); - } else { - mutex_unlock(&ctx->uring_lock); - ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask); - mutex_lock(&ctx->uring_lock); - } - - return ret; -} - -static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, - void __user *arg, unsigned len) -{ - cpumask_var_t new_mask; - int ret; - - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) - return -ENOMEM; - - cpumask_clear(new_mask); - if (len > cpumask_size()) - len = cpumask_size(); - - if (in_compat_syscall()) { - ret = compat_get_bitmap(cpumask_bits(new_mask), - (const compat_ulong_t __user *)arg, - len * 8 /* CHAR_BIT */); - } else { - ret = copy_from_user(new_mask, arg, len); - } - - if (ret) { - free_cpumask_var(new_mask); - return -EFAULT; - } - - ret = __io_register_iowq_aff(ctx, new_mask); - free_cpumask_var(new_mask); - return ret; -} - -static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) -{ - return __io_register_iowq_aff(ctx, NULL); -} - -static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, - void __user *arg) - __must_hold(&ctx->uring_lock) -{ - struct io_tctx_node *node; - struct io_uring_task *tctx = NULL; - struct io_sq_data *sqd = NULL; - __u32 new_count[2]; - int i, ret; - - if (copy_from_user(new_count, arg, sizeof(new_count))) - return -EFAULT; - for (i = 0; i < ARRAY_SIZE(new_count); i++) - if (new_count[i] > INT_MAX) - return -EINVAL; - - if (ctx->flags & IORING_SETUP_SQPOLL) { - sqd = ctx->sq_data; - if (sqd) { - /* - * Observe the correct sqd->lock -> ctx->uring_lock - * ordering. Fine to drop uring_lock here, we hold - * a ref to the ctx. - */ - refcount_inc(&sqd->refs); - mutex_unlock(&ctx->uring_lock); - mutex_lock(&sqd->lock); - mutex_lock(&ctx->uring_lock); - if (sqd->thread) - tctx = sqd->thread->io_uring; - } - } else { - tctx = current->io_uring; - } - - BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits)); - - for (i = 0; i < ARRAY_SIZE(new_count); i++) - if (new_count[i]) - ctx->iowq_limits[i] = new_count[i]; - ctx->iowq_limits_set = true; - - if (tctx && tctx->io_wq) { - ret = io_wq_max_workers(tctx->io_wq, new_count); - if (ret) - goto err; - } else { - memset(new_count, 0, sizeof(new_count)); - } - - if (sqd) { - mutex_unlock(&sqd->lock); - io_put_sq_data(sqd); - } - - if (copy_to_user(arg, new_count, sizeof(new_count))) - return -EFAULT; - - /* that's it for SQPOLL, only the SQPOLL task creates requests */ - if (sqd) - return 0; - - /* now propagate the restriction to all registered users */ - list_for_each_entry(node, &ctx->tctx_list, ctx_node) { - struct io_uring_task *tctx = node->task->io_uring; - - if (WARN_ON_ONCE(!tctx->io_wq)) - continue; - - for (i = 0; i < ARRAY_SIZE(new_count); i++) - new_count[i] = ctx->iowq_limits[i]; - /* ignore errors, it always returns zero anyway */ - (void)io_wq_max_workers(tctx->io_wq, new_count); - } - return 0; -err: - if (sqd) { - mutex_unlock(&sqd->lock); - io_put_sq_data(sqd); - } - return ret; -} - -static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, - void __user *arg, unsigned nr_args) - __releases(ctx->uring_lock) - __acquires(ctx->uring_lock) -{ - int ret; - - /* - * We don't quiesce the refs for register anymore and so it can't be - * dying as we're holding a file ref here. - */ - if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs))) - return -ENXIO; - - if (ctx->submitter_task && ctx->submitter_task != current) - return -EEXIST; - - if (ctx->restricted) { - opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); - if (!test_bit(opcode, ctx->restrictions.register_op)) - return -EACCES; - } - - switch (opcode) { - case IORING_REGISTER_BUFFERS: - ret = -EFAULT; - if (!arg) - break; - ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); - break; - case IORING_UNREGISTER_BUFFERS: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_sqe_buffers_unregister(ctx); - break; - case IORING_REGISTER_FILES: - ret = -EFAULT; - if (!arg) - break; - ret = io_sqe_files_register(ctx, arg, nr_args, NULL); - break; - case IORING_UNREGISTER_FILES: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_sqe_files_unregister(ctx); - break; - case IORING_REGISTER_FILES_UPDATE: - ret = io_register_files_update(ctx, arg, nr_args); - break; - case IORING_REGISTER_EVENTFD: - ret = -EINVAL; - if (nr_args != 1) - break; - ret = io_eventfd_register(ctx, arg, 0); - break; - case IORING_REGISTER_EVENTFD_ASYNC: - ret = -EINVAL; - if (nr_args != 1) - break; - ret = io_eventfd_register(ctx, arg, 1); - break; - case IORING_UNREGISTER_EVENTFD: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_eventfd_unregister(ctx); - break; - case IORING_REGISTER_PROBE: - ret = -EINVAL; - if (!arg || nr_args > 256) - break; - ret = io_probe(ctx, arg, nr_args); - break; - case IORING_REGISTER_PERSONALITY: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_register_personality(ctx); - break; - case IORING_UNREGISTER_PERSONALITY: - ret = -EINVAL; - if (arg) - break; - ret = io_unregister_personality(ctx, nr_args); - break; - case IORING_REGISTER_ENABLE_RINGS: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_register_enable_rings(ctx); - break; - case IORING_REGISTER_RESTRICTIONS: - ret = io_register_restrictions(ctx, arg, nr_args); - break; - case IORING_REGISTER_FILES2: - ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE); - break; - case IORING_REGISTER_FILES_UPDATE2: - ret = io_register_rsrc_update(ctx, arg, nr_args, - IORING_RSRC_FILE); - break; - case IORING_REGISTER_BUFFERS2: - ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER); - break; - case IORING_REGISTER_BUFFERS_UPDATE: - ret = io_register_rsrc_update(ctx, arg, nr_args, - IORING_RSRC_BUFFER); - break; - case IORING_REGISTER_IOWQ_AFF: - ret = -EINVAL; - if (!arg || !nr_args) - break; - ret = io_register_iowq_aff(ctx, arg, nr_args); - break; - case IORING_UNREGISTER_IOWQ_AFF: - ret = -EINVAL; - if (arg || nr_args) - break; - ret = io_unregister_iowq_aff(ctx); - break; - case IORING_REGISTER_IOWQ_MAX_WORKERS: - ret = -EINVAL; - if (!arg || nr_args != 2) - break; - ret = io_register_iowq_max_workers(ctx, arg); - break; - case IORING_REGISTER_RING_FDS: - ret = io_ringfd_register(ctx, arg, nr_args); - break; - case IORING_UNREGISTER_RING_FDS: - ret = io_ringfd_unregister(ctx, arg, nr_args); - break; - case IORING_REGISTER_PBUF_RING: - ret = -EINVAL; - if (!arg || nr_args != 1) - break; - ret = io_register_pbuf_ring(ctx, arg); - break; - case IORING_UNREGISTER_PBUF_RING: - ret = -EINVAL; - if (!arg || nr_args != 1) - break; - ret = io_unregister_pbuf_ring(ctx, arg); - break; - case IORING_REGISTER_SYNC_CANCEL: - ret = -EINVAL; - if (!arg || nr_args != 1) - break; - ret = io_sync_cancel(ctx, arg); - break; - case IORING_REGISTER_FILE_ALLOC_RANGE: - ret = -EINVAL; - if (!arg || nr_args) - break; - ret = io_register_file_alloc_range(ctx, arg); - break; - default: - ret = -EINVAL; - break; - } - - return ret; -} - -SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, - void __user *, arg, unsigned int, nr_args) -{ - struct io_ring_ctx *ctx; - long ret = -EBADF; - struct file *file; - bool use_registered_ring; - - use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING); - opcode &= ~IORING_REGISTER_USE_REGISTERED_RING; - - if (opcode >= IORING_REGISTER_LAST) - return -EINVAL; - - if (use_registered_ring) { - /* - * Ring fd has been registered via IORING_REGISTER_RING_FDS, we - * need only dereference our task private array to find it. - */ - struct io_uring_task *tctx = current->io_uring; - - if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) - return -EINVAL; - fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); - file = tctx->registered_rings[fd]; - if (unlikely(!file)) - return -EBADF; - } else { - file = fget(fd); - if (unlikely(!file)) - return -EBADF; - ret = -EOPNOTSUPP; - if (!io_is_uring_fops(file)) - goto out_fput; - } - - ctx = file->private_data; - - mutex_lock(&ctx->uring_lock); - ret = __io_uring_register(ctx, opcode, arg, nr_args); - mutex_unlock(&ctx->uring_lock); - trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret); -out_fput: - if (!use_registered_ring) - fput(file); - return ret; -} - static int __init io_uring_init(void) { #define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \ diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index ed84f2737b3a..04e33f25919c 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -15,16 +15,6 @@ #include <trace/events/io_uring.h> #endif -enum { - /* - * A hint to not wake right away but delay until there are enough of - * tw's queued to match the number of CQEs the task is waiting for. - * - * Must not be used wirh requests generating more than one CQE. - * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set. - */ - IOU_F_TWQ_LAZY_WAKE = 1, -}; enum { IOU_OK = 0, @@ -54,7 +44,6 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); -bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use); @@ -89,6 +78,14 @@ bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, void *io_mem_alloc(size_t size); void io_mem_free(void *ptr); +enum { + IO_EVENTFD_OP_SIGNAL_BIT, + IO_EVENTFD_OP_FREE_BIT, +}; + +void io_eventfd_ops(struct rcu_head *rcu); +void io_activate_pollwq(struct io_ring_ctx *ctx); + #if defined(CONFIG_PROVE_LOCKING) static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) { diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 72b6af1d2ed3..18df5a9d2f5e 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -750,6 +750,32 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return 0; } +int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_status buf_status; + struct io_buffer_list *bl; + int i; + + if (copy_from_user(&buf_status, arg, sizeof(buf_status))) + return -EFAULT; + + for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++) + if (buf_status.resv[i]) + return -EINVAL; + + bl = io_buffer_get_list(ctx, buf_status.buf_group); + if (!bl) + return -ENOENT; + if (!bl->is_mapped) + return -EINVAL; + + buf_status.head = bl->head; + if (copy_to_user(arg, &buf_status, sizeof(buf_status))) + return -EFAULT; + + return 0; +} + void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid) { struct io_buffer_list *bl; diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 9be5960817ea..53dfaa71a397 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -53,6 +53,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags); int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); +int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 799db44283c7..6705634e5f52 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -469,6 +469,12 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_eopnotsupp_prep, #endif }, + [IORING_OP_FIXED_FD_INSTALL] = { + .needs_file = 1, + .audit_skip = 1, + .prep = io_install_fixed_fd_prep, + .issue = io_install_fixed_fd, + }, }; const struct io_cold_def io_cold_defs[] = { @@ -704,6 +710,9 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_FUTEX_WAITV] = { .name = "FUTEX_WAITV", }, + [IORING_OP_FIXED_FD_INSTALL] = { + .name = "FIXED_FD_INSTALL", + }, }; const char *io_uring_get_opcode(u8 opcode) diff --git a/io_uring/openclose.c b/io_uring/openclose.c index 74fc22461f48..0fe0dd305546 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -31,6 +31,11 @@ struct io_close { u32 file_slot; }; +struct io_fixed_install { + struct file *file; + unsigned int o_flags; +}; + static bool io_openat_force_async(struct io_open *open) { /* @@ -254,3 +259,42 @@ err: io_req_set_res(req, ret, 0); return IOU_OK; } + +int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_fixed_install *ifi; + unsigned int flags; + + if (sqe->off || sqe->addr || sqe->len || sqe->buf_index || + sqe->splice_fd_in || sqe->addr3) + return -EINVAL; + + /* must be a fixed file */ + if (!(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + flags = READ_ONCE(sqe->install_fd_flags); + if (flags & ~IORING_FIXED_FD_NO_CLOEXEC) + return -EINVAL; + + /* default to O_CLOEXEC, disable if IORING_FIXED_FD_NO_CLOEXEC is set */ + ifi = io_kiocb_to_cmd(req, struct io_fixed_install); + ifi->o_flags = O_CLOEXEC; + if (flags & IORING_FIXED_FD_NO_CLOEXEC) + ifi->o_flags = 0; + + return 0; +} + +int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_fixed_install *ifi; + int ret; + + ifi = io_kiocb_to_cmd(req, struct io_fixed_install); + ret = receive_fd(req->file, NULL, ifi->o_flags); + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/openclose.h b/io_uring/openclose.h index 4b1c28d3a66c..8a93c98ad0ad 100644 --- a/io_uring/openclose.h +++ b/io_uring/openclose.h @@ -12,3 +12,6 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags); int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_close(struct io_kiocb *req, unsigned int issue_flags); + +int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/register.c b/io_uring/register.c new file mode 100644 index 000000000000..708dd1d89add --- /dev/null +++ b/io_uring/register.c @@ -0,0 +1,605 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code related to the io_uring_register() syscall + * + * Copyright (C) 2023 Jens Axboe + */ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/syscalls.h> +#include <linux/refcount.h> +#include <linux/bits.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/nospec.h> +#include <linux/io_uring.h> +#include <linux/io_uring_types.h> + +#include "io_uring.h" +#include "opdef.h" +#include "tctx.h" +#include "rsrc.h" +#include "sqpoll.h" +#include "register.h" +#include "cancel.h" +#include "kbuf.h" + +#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ + IORING_REGISTER_LAST + IORING_OP_LAST) + +static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int eventfd_async) +{ + struct io_ev_fd *ev_fd; + __s32 __user *fds = arg; + int fd; + + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) + return -EBUSY; + + if (copy_from_user(&fd, fds, sizeof(*fds))) + return -EFAULT; + + ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); + if (!ev_fd) + return -ENOMEM; + + ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); + if (IS_ERR(ev_fd->cq_ev_fd)) { + int ret = PTR_ERR(ev_fd->cq_ev_fd); + kfree(ev_fd); + return ret; + } + + spin_lock(&ctx->completion_lock); + ctx->evfd_last_cq_tail = ctx->cached_cq_tail; + spin_unlock(&ctx->completion_lock); + + ev_fd->eventfd_async = eventfd_async; + ctx->has_evfd = true; + rcu_assign_pointer(ctx->io_ev_fd, ev_fd); + atomic_set(&ev_fd->refs, 1); + atomic_set(&ev_fd->ops, 0); + return 0; +} + +int io_eventfd_unregister(struct io_ring_ctx *ctx) +{ + struct io_ev_fd *ev_fd; + + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) { + ctx->has_evfd = false; + rcu_assign_pointer(ctx->io_ev_fd, NULL); + if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops)) + call_rcu(&ev_fd->rcu, io_eventfd_ops); + return 0; + } + + return -ENXIO; +} + +static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + struct io_uring_probe *p; + size_t size; + int i, ret; + + size = struct_size(p, ops, nr_args); + if (size == SIZE_MAX) + return -EOVERFLOW; + p = kzalloc(size, GFP_KERNEL); + if (!p) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(p, arg, size)) + goto out; + ret = -EINVAL; + if (memchr_inv(p, 0, size)) + goto out; + + p->last_op = IORING_OP_LAST - 1; + if (nr_args > IORING_OP_LAST) + nr_args = IORING_OP_LAST; + + for (i = 0; i < nr_args; i++) { + p->ops[i].op = i; + if (!io_issue_defs[i].not_supported) + p->ops[i].flags = IO_URING_OP_SUPPORTED; + } + p->ops_len = i; + + ret = 0; + if (copy_to_user(arg, p, size)) + ret = -EFAULT; +out: + kfree(p); + return ret; +} + +int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) +{ + const struct cred *creds; + + creds = xa_erase(&ctx->personalities, id); + if (creds) { + put_cred(creds); + return 0; + } + + return -EINVAL; +} + + +static int io_register_personality(struct io_ring_ctx *ctx) +{ + const struct cred *creds; + u32 id; + int ret; + + creds = get_current_cred(); + + ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds, + XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL); + if (ret < 0) { + put_cred(creds); + return ret; + } + return id; +} + +static __cold int io_register_restrictions(struct io_ring_ctx *ctx, + void __user *arg, unsigned int nr_args) +{ + struct io_uring_restriction *res; + size_t size; + int i, ret; + + /* Restrictions allowed only if rings started disabled */ + if (!(ctx->flags & IORING_SETUP_R_DISABLED)) + return -EBADFD; + + /* We allow only a single restrictions registration */ + if (ctx->restrictions.registered) + return -EBUSY; + + if (!arg || nr_args > IORING_MAX_RESTRICTIONS) + return -EINVAL; + + size = array_size(nr_args, sizeof(*res)); + if (size == SIZE_MAX) + return -EOVERFLOW; + + res = memdup_user(arg, size); + if (IS_ERR(res)) + return PTR_ERR(res); + + ret = 0; + + for (i = 0; i < nr_args; i++) { + switch (res[i].opcode) { + case IORING_RESTRICTION_REGISTER_OP: + if (res[i].register_op >= IORING_REGISTER_LAST) { + ret = -EINVAL; + goto out; + } + + __set_bit(res[i].register_op, + ctx->restrictions.register_op); + break; + case IORING_RESTRICTION_SQE_OP: + if (res[i].sqe_op >= IORING_OP_LAST) { + ret = -EINVAL; + goto out; + } + + __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op); + break; + case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: + ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; + break; + case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: + ctx->restrictions.sqe_flags_required = res[i].sqe_flags; + break; + default: + ret = -EINVAL; + goto out; + } + } + +out: + /* Reset all restrictions if an error happened */ + if (ret != 0) + memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); + else + ctx->restrictions.registered = true; + + kfree(res); + return ret; +} + +static int io_register_enable_rings(struct io_ring_ctx *ctx) +{ + if (!(ctx->flags & IORING_SETUP_R_DISABLED)) + return -EBADFD; + + if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) { + WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); + /* + * Lazy activation attempts would fail if it was polled before + * submitter_task is set. + */ + if (wq_has_sleeper(&ctx->poll_wq)) + io_activate_pollwq(ctx); + } + + if (ctx->restrictions.registered) + ctx->restricted = 1; + + ctx->flags &= ~IORING_SETUP_R_DISABLED; + if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) + wake_up(&ctx->sq_data->wait); + return 0; +} + +static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx, + cpumask_var_t new_mask) +{ + int ret; + + if (!(ctx->flags & IORING_SETUP_SQPOLL)) { + ret = io_wq_cpu_affinity(current->io_uring, new_mask); + } else { + mutex_unlock(&ctx->uring_lock); + ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask); + mutex_lock(&ctx->uring_lock); + } + + return ret; +} + +static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, + void __user *arg, unsigned len) +{ + cpumask_var_t new_mask; + int ret; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_clear(new_mask); + if (len > cpumask_size()) + len = cpumask_size(); + + if (in_compat_syscall()) { + ret = compat_get_bitmap(cpumask_bits(new_mask), + (const compat_ulong_t __user *)arg, + len * 8 /* CHAR_BIT */); + } else { + ret = copy_from_user(new_mask, arg, len); + } + + if (ret) { + free_cpumask_var(new_mask); + return -EFAULT; + } + + ret = __io_register_iowq_aff(ctx, new_mask); + free_cpumask_var(new_mask); + return ret; +} + +static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) +{ + return __io_register_iowq_aff(ctx, NULL); +} + +static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, + void __user *arg) + __must_hold(&ctx->uring_lock) +{ + struct io_tctx_node *node; + struct io_uring_task *tctx = NULL; + struct io_sq_data *sqd = NULL; + __u32 new_count[2]; + int i, ret; + + if (copy_from_user(new_count, arg, sizeof(new_count))) + return -EFAULT; + for (i = 0; i < ARRAY_SIZE(new_count); i++) + if (new_count[i] > INT_MAX) + return -EINVAL; + + if (ctx->flags & IORING_SETUP_SQPOLL) { + sqd = ctx->sq_data; + if (sqd) { + /* + * Observe the correct sqd->lock -> ctx->uring_lock + * ordering. Fine to drop uring_lock here, we hold + * a ref to the ctx. + */ + refcount_inc(&sqd->refs); + mutex_unlock(&ctx->uring_lock); + mutex_lock(&sqd->lock); + mutex_lock(&ctx->uring_lock); + if (sqd->thread) + tctx = sqd->thread->io_uring; + } + } else { + tctx = current->io_uring; + } + + BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits)); + + for (i = 0; i < ARRAY_SIZE(new_count); i++) + if (new_count[i]) + ctx->iowq_limits[i] = new_count[i]; + ctx->iowq_limits_set = true; + + if (tctx && tctx->io_wq) { + ret = io_wq_max_workers(tctx->io_wq, new_count); + if (ret) + goto err; + } else { + memset(new_count, 0, sizeof(new_count)); + } + + if (sqd) { + mutex_unlock(&sqd->lock); + io_put_sq_data(sqd); + } + + if (copy_to_user(arg, new_count, sizeof(new_count))) + return -EFAULT; + + /* that's it for SQPOLL, only the SQPOLL task creates requests */ + if (sqd) + return 0; + + /* now propagate the restriction to all registered users */ + list_for_each_entry(node, &ctx->tctx_list, ctx_node) { + struct io_uring_task *tctx = node->task->io_uring; + + if (WARN_ON_ONCE(!tctx->io_wq)) + continue; + + for (i = 0; i < ARRAY_SIZE(new_count); i++) + new_count[i] = ctx->iowq_limits[i]; + /* ignore errors, it always returns zero anyway */ + (void)io_wq_max_workers(tctx->io_wq, new_count); + } + return 0; +err: + if (sqd) { + mutex_unlock(&sqd->lock); + io_put_sq_data(sqd); + } + return ret; +} + +static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, + void __user *arg, unsigned nr_args) + __releases(ctx->uring_lock) + __acquires(ctx->uring_lock) +{ + int ret; + + /* + * We don't quiesce the refs for register anymore and so it can't be + * dying as we're holding a file ref here. + */ + if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs))) + return -ENXIO; + + if (ctx->submitter_task && ctx->submitter_task != current) + return -EEXIST; + + if (ctx->restricted) { + opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); + if (!test_bit(opcode, ctx->restrictions.register_op)) + return -EACCES; + } + + switch (opcode) { + case IORING_REGISTER_BUFFERS: + ret = -EFAULT; + if (!arg) + break; + ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); + break; + case IORING_UNREGISTER_BUFFERS: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_sqe_buffers_unregister(ctx); + break; + case IORING_REGISTER_FILES: + ret = -EFAULT; + if (!arg) + break; + ret = io_sqe_files_register(ctx, arg, nr_args, NULL); + break; + case IORING_UNREGISTER_FILES: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_sqe_files_unregister(ctx); + break; + case IORING_REGISTER_FILES_UPDATE: + ret = io_register_files_update(ctx, arg, nr_args); + break; + case IORING_REGISTER_EVENTFD: + ret = -EINVAL; + if (nr_args != 1) + break; + ret = io_eventfd_register(ctx, arg, 0); + break; + case IORING_REGISTER_EVENTFD_ASYNC: + ret = -EINVAL; + if (nr_args != 1) + break; + ret = io_eventfd_register(ctx, arg, 1); + break; + case IORING_UNREGISTER_EVENTFD: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_eventfd_unregister(ctx); + break; + case IORING_REGISTER_PROBE: + ret = -EINVAL; + if (!arg || nr_args > 256) + break; + ret = io_probe(ctx, arg, nr_args); + break; + case IORING_REGISTER_PERSONALITY: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_register_personality(ctx); + break; + case IORING_UNREGISTER_PERSONALITY: + ret = -EINVAL; + if (arg) + break; + ret = io_unregister_personality(ctx, nr_args); + break; + case IORING_REGISTER_ENABLE_RINGS: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_register_enable_rings(ctx); + break; + case IORING_REGISTER_RESTRICTIONS: + ret = io_register_restrictions(ctx, arg, nr_args); + break; + case IORING_REGISTER_FILES2: + ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE); + break; + case IORING_REGISTER_FILES_UPDATE2: + ret = io_register_rsrc_update(ctx, arg, nr_args, + IORING_RSRC_FILE); + break; + case IORING_REGISTER_BUFFERS2: + ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER); + break; + case IORING_REGISTER_BUFFERS_UPDATE: + ret = io_register_rsrc_update(ctx, arg, nr_args, + IORING_RSRC_BUFFER); + break; + case IORING_REGISTER_IOWQ_AFF: + ret = -EINVAL; + if (!arg || !nr_args) + break; + ret = io_register_iowq_aff(ctx, arg, nr_args); + break; + case IORING_UNREGISTER_IOWQ_AFF: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_unregister_iowq_aff(ctx); + break; + case IORING_REGISTER_IOWQ_MAX_WORKERS: + ret = -EINVAL; + if (!arg || nr_args != 2) + break; + ret = io_register_iowq_max_workers(ctx, arg); + break; + case IORING_REGISTER_RING_FDS: + ret = io_ringfd_register(ctx, arg, nr_args); + break; + case IORING_UNREGISTER_RING_FDS: + ret = io_ringfd_unregister(ctx, arg, nr_args); + break; + case IORING_REGISTER_PBUF_RING: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_pbuf_ring(ctx, arg); + break; + case IORING_UNREGISTER_PBUF_RING: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_unregister_pbuf_ring(ctx, arg); + break; + case IORING_REGISTER_SYNC_CANCEL: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_sync_cancel(ctx, arg); + break; + case IORING_REGISTER_FILE_ALLOC_RANGE: + ret = -EINVAL; + if (!arg || nr_args) + break; + ret = io_register_file_alloc_range(ctx, arg); + break; + case IORING_REGISTER_PBUF_STATUS: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_pbuf_status(ctx, arg); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, + void __user *, arg, unsigned int, nr_args) +{ + struct io_ring_ctx *ctx; + long ret = -EBADF; + struct file *file; + bool use_registered_ring; + + use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING); + opcode &= ~IORING_REGISTER_USE_REGISTERED_RING; + + if (opcode >= IORING_REGISTER_LAST) + return -EINVAL; + + if (use_registered_ring) { + /* + * Ring fd has been registered via IORING_REGISTER_RING_FDS, we + * need only dereference our task private array to find it. + */ + struct io_uring_task *tctx = current->io_uring; + + if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) + return -EINVAL; + fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); + file = tctx->registered_rings[fd]; + if (unlikely(!file)) + return -EBADF; + } else { + file = fget(fd); + if (unlikely(!file)) + return -EBADF; + ret = -EOPNOTSUPP; + if (!io_is_uring_fops(file)) + goto out_fput; + } + + ctx = file->private_data; + + mutex_lock(&ctx->uring_lock); + ret = __io_uring_register(ctx, opcode, arg, nr_args); + mutex_unlock(&ctx->uring_lock); + trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret); +out_fput: + if (!use_registered_ring) + fput(file); + return ret; +} diff --git a/io_uring/register.h b/io_uring/register.h new file mode 100644 index 000000000000..c9da997d503c --- /dev/null +++ b/io_uring/register.h @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IORING_REGISTER_H +#define IORING_REGISTER_H + +int io_eventfd_unregister(struct io_ring_ctx *ctx); +int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id); + +#endif diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index f521c5965a93..4818b79231dd 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -24,7 +24,6 @@ struct io_rsrc_update { }; static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); -static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct io_mapped_ubuf **pimu, struct page **last_hpage); @@ -157,7 +156,7 @@ static void io_rsrc_put_work(struct io_rsrc_node *node) switch (node->type) { case IORING_RSRC_FILE: - io_rsrc_file_put(node->ctx, prsrc); + fput(prsrc->file); break; case IORING_RSRC_BUFFER: io_rsrc_buf_put(node->ctx, prsrc); @@ -402,23 +401,13 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, break; } /* - * Don't allow io_uring instances to be registered. If - * UNIX isn't enabled, then this causes a reference - * cycle and this instance can never get freed. If UNIX - * is enabled we'll handle it just fine, but there's - * still no point in allowing a ring fd as it doesn't - * support regular read/write anyway. + * Don't allow io_uring instances to be registered. */ if (io_is_uring_fops(file)) { fput(file); err = -EBADF; break; } - err = io_scm_file_account(ctx, file); - if (err) { - fput(file); - break; - } *io_get_tag_slot(data, i) = tag; io_fixed_file_set(file_slot, file); io_file_bitmap_set(&ctx->file_table, i); @@ -675,22 +664,12 @@ void __io_sqe_files_unregister(struct io_ring_ctx *ctx) for (i = 0; i < ctx->nr_user_files; i++) { struct file *file = io_file_from_index(&ctx->file_table, i); - /* skip scm accounted files, they'll be freed by ->ring_sock */ - if (!file || io_file_need_scm(file)) + if (!file) continue; io_file_bitmap_clear(&ctx->file_table, i); fput(file); } -#if defined(CONFIG_UNIX) - if (ctx->ring_sock) { - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff *skb; - - while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) - kfree_skb(skb); - } -#endif io_free_file_tables(&ctx->file_table); io_file_table_set_alloc_range(ctx, 0, 0); io_rsrc_data_free(ctx->file_data); @@ -718,137 +697,6 @@ int io_sqe_files_unregister(struct io_ring_ctx *ctx) return ret; } -/* - * Ensure the UNIX gc is aware of our file set, so we are certain that - * the io_uring can be safely unregistered on process exit, even if we have - * loops in the file referencing. We account only files that can hold other - * files because otherwise they can't form a loop and so are not interesting - * for GC. - */ -int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file) -{ -#if defined(CONFIG_UNIX) - struct sock *sk = ctx->ring_sock->sk; - struct sk_buff_head *head = &sk->sk_receive_queue; - struct scm_fp_list *fpl; - struct sk_buff *skb; - - if (likely(!io_file_need_scm(file))) - return 0; - - /* - * See if we can merge this file into an existing skb SCM_RIGHTS - * file set. If there's no room, fall back to allocating a new skb - * and filling it in. - */ - spin_lock_irq(&head->lock); - skb = skb_peek(head); - if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD) - __skb_unlink(skb, head); - else - skb = NULL; - spin_unlock_irq(&head->lock); - - if (!skb) { - fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); - if (!fpl) - return -ENOMEM; - - skb = alloc_skb(0, GFP_KERNEL); - if (!skb) { - kfree(fpl); - return -ENOMEM; - } - - fpl->user = get_uid(current_user()); - fpl->max = SCM_MAX_FD; - fpl->count = 0; - - UNIXCB(skb).fp = fpl; - skb->sk = sk; - skb->destructor = io_uring_destruct_scm; - refcount_add(skb->truesize, &sk->sk_wmem_alloc); - } - - fpl = UNIXCB(skb).fp; - fpl->fp[fpl->count++] = get_file(file); - unix_inflight(fpl->user, file); - skb_queue_head(head, skb); - fput(file); -#endif - return 0; -} - -static __cold void io_rsrc_file_scm_put(struct io_ring_ctx *ctx, struct file *file) -{ -#if defined(CONFIG_UNIX) - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff_head list, *head = &sock->sk_receive_queue; - struct sk_buff *skb; - int i; - - __skb_queue_head_init(&list); - - /* - * Find the skb that holds this file in its SCM_RIGHTS. When found, - * remove this entry and rearrange the file array. - */ - skb = skb_dequeue(head); - while (skb) { - struct scm_fp_list *fp; - - fp = UNIXCB(skb).fp; - for (i = 0; i < fp->count; i++) { - int left; - - if (fp->fp[i] != file) - continue; - - unix_notinflight(fp->user, fp->fp[i]); - left = fp->count - 1 - i; - if (left) { - memmove(&fp->fp[i], &fp->fp[i + 1], - left * sizeof(struct file *)); - } - fp->count--; - if (!fp->count) { - kfree_skb(skb); - skb = NULL; - } else { - __skb_queue_tail(&list, skb); - } - fput(file); - file = NULL; - break; - } - - if (!file) - break; - - __skb_queue_tail(&list, skb); - - skb = skb_dequeue(head); - } - - if (skb_peek(&list)) { - spin_lock_irq(&head->lock); - while ((skb = __skb_dequeue(&list)) != NULL) - __skb_queue_tail(head, skb); - spin_unlock_irq(&head->lock); - } -#endif -} - -static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) -{ - struct file *file = prsrc->file; - - if (likely(!io_file_need_scm(file))) - fput(file); - else - io_rsrc_file_scm_put(ctx, file); -} - int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags) { @@ -897,21 +745,12 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, goto fail; /* - * Don't allow io_uring instances to be registered. If UNIX - * isn't enabled, then this causes a reference cycle and this - * instance can never get freed. If UNIX is enabled we'll - * handle it just fine, but there's still no point in allowing - * a ring fd as it doesn't support regular read/write anyway. + * Don't allow io_uring instances to be registered. */ if (io_is_uring_fops(file)) { fput(file); goto fail; } - ret = io_scm_file_account(ctx, file); - if (ret) { - fput(file); - goto fail; - } file_slot = io_fixed_file_slot(&ctx->file_table, i); io_fixed_file_set(file_slot, file); io_file_bitmap_set(&ctx->file_table, i); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 08ac0d8e07ef..7238b9cfe33b 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -75,21 +75,6 @@ int io_sqe_files_unregister(struct io_ring_ctx *ctx); int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags); -int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file); - -static inline bool io_file_need_scm(struct file *filp) -{ - return false; -} - -static inline int io_scm_file_account(struct io_ring_ctx *ctx, - struct file *file) -{ - if (likely(!io_file_need_scm(file))) - return 0; - return __io_scm_file_account(ctx, file); -} - int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args); int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, diff --git a/io_uring/rw.c b/io_uring/rw.c index 64390d4e20c1..0c856726b15d 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -10,7 +10,7 @@ #include <linux/poll.h> #include <linux/nospec.h> #include <linux/compat.h> -#include <linux/io_uring.h> +#include <linux/io_uring/cmd.h> #include <uapi/linux/io_uring.h> @@ -589,15 +589,19 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw) struct iovec *iov; int ret; + iorw->bytes_done = 0; + iorw->free_iovec = NULL; + /* submission path, ->uring_lock should already be taken */ ret = io_import_iovec(rw, req, &iov, &iorw->s, 0); if (unlikely(ret < 0)) return ret; - iorw->bytes_done = 0; - iorw->free_iovec = iov; - if (iov) + if (iov) { + iorw->free_iovec = iov; req->flags |= REQ_F_NEED_CLEANUP; + } + return 0; } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 8958666a379a..c33fca585dde 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -2,7 +2,7 @@ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/file.h> -#include <linux/io_uring.h> +#include <linux/io_uring/cmd.h> #include <linux/security.h> #include <linux/nospec.h> @@ -52,12 +52,6 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, } EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); -struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd) -{ - return cmd_to_io_kiocb(cmd)->task; -} -EXPORT_SYMBOL_GPL(io_uring_cmd_get_task); - static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); @@ -78,13 +72,6 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, } EXPORT_SYMBOL_GPL(__io_uring_cmd_do_in_task); -void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) -{ - __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE); -} -EXPORT_SYMBOL_GPL(io_uring_cmd_do_in_task_lazy); - static inline void io_req_set_cqe32_extra(struct io_kiocb *req, u64 extra1, u64 extra2) { |