diff options
author | Jens Axboe <axboe@kernel.dk> | 2021-06-17 10:19:54 -0600 |
---|---|---|
committer | Jens Axboe <axboe@kernel.dk> | 2021-06-17 10:25:50 -0600 |
commit | fe76421d1da1dcdb3a2cd8428ac40106bff28bc0 (patch) | |
tree | dc590c0c33f23e65a0e11cceb7be870944d0360a /fs | |
parent | 0e03496d1967abf1ebb151a24318c07d07f41f7f (diff) |
io_uring: allow user configurable IO thread CPU affinity
io-wq defaults to per-node masks for IO workers. This works fine by
default, but isn't particularly handy for workloads that prefer more
specific affinities, for either performance or isolation reasons.
This adds IORING_REGISTER_IOWQ_AFF that allows the user to pass in a CPU
mask that is then applied to IO thread workers, and an
IORING_UNREGISTER_IOWQ_AFF that simply resets the masks back to the
default of per-node.
Note that no care is given to existing IO threads, they will need to go
through a reschedule before the affinity is correct if they are already
running or sleeping.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/io-wq.c | 17 | ||||
-rw-r--r-- | fs/io-wq.h | 2 | ||||
-rw-r--r-- | fs/io_uring.c | 51 |
3 files changed, 70 insertions, 0 deletions
diff --git a/fs/io-wq.c b/fs/io-wq.c index 2af8e1df4646..bb4d3ee9592e 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -1087,6 +1087,23 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node) return __io_wq_cpu_online(wq, cpu, false); } +int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) +{ + int i; + + rcu_read_lock(); + for_each_node(i) { + struct io_wqe *wqe = wq->wqes[i]; + + if (mask) + cpumask_copy(wqe->cpu_mask, mask); + else + cpumask_copy(wqe->cpu_mask, cpumask_of_node(i)); + } + rcu_read_unlock(); + return 0; +} + static __init int io_wq_init(void) { int ret; diff --git a/fs/io-wq.h b/fs/io-wq.h index af2df0680ee2..02299cdcf55c 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -128,6 +128,8 @@ void io_wq_put_and_exit(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_hash_work(struct io_wq_work *work, void *val); +int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask); + static inline bool io_wq_is_hashed(struct io_wq_work *work) { return work->flags & IO_WQ_WORK_HASHED; diff --git a/fs/io_uring.c b/fs/io_uring.c index d916eb2cef09..46a25a7cb70a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9983,6 +9983,43 @@ static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; } +static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg, + unsigned len) +{ + struct io_uring_task *tctx = current->io_uring; + cpumask_var_t new_mask; + int ret; + + if (!tctx || !tctx->io_wq) + return -EINVAL; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_clear(new_mask); + if (len > cpumask_size()) + len = cpumask_size(); + + if (copy_from_user(new_mask, arg, len)) { + free_cpumask_var(new_mask); + return -EFAULT; + } + + ret = io_wq_cpu_affinity(tctx->io_wq, new_mask); + free_cpumask_var(new_mask); + return ret; +} + +static int io_unregister_iowq_aff(struct io_ring_ctx *ctx) +{ + struct io_uring_task *tctx = current->io_uring; + + if (!tctx || !tctx->io_wq) + return -EINVAL; + + return io_wq_cpu_affinity(tctx->io_wq, NULL); +} + static bool io_register_op_must_quiesce(int op) { switch (op) { @@ -9998,6 +10035,8 @@ static bool io_register_op_must_quiesce(int op) case IORING_REGISTER_FILES_UPDATE2: case IORING_REGISTER_BUFFERS2: case IORING_REGISTER_BUFFERS_UPDATE: + case IORING_REGISTER_IOWQ_AFF: + case IORING_UNREGISTER_IOWQ_AFF: return false; default: return true; @@ -10137,6 +10176,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_register_rsrc_update(ctx, arg, nr_args, IORING_RSRC_BUFFER); break; + case IORING_REGISTER_IOWQ_AFF: + ret = -EINVAL; + if (!arg || !nr_args) + break; + ret = io_register_iowq_aff(ctx, arg, nr_args); + break; + case IORING_UNREGISTER_IOWQ_AFF: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_unregister_iowq_aff(ctx); + break; default: ret = -EINVAL; break; |