diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-01-29 18:53:37 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-01-29 18:53:37 -0800 |
commit | 896f8d23d0cb5889021d66eab6107e97109c5459 (patch) | |
tree | f8ceade3203209679ece27d3cace410178dfc99c /include | |
parent | 33c84e89abe4a92ab699c33029bd54269d574782 (diff) | |
parent | 3e4827b05d2ac2d377ed136a52829ec46787bf4b (diff) |
Merge tag 'for-5.6/io_uring-vfs-2020-01-29' of git://git.kernel.dk/linux-block
Pull io_uring updates from Jens Axboe:
- Support for various new opcodes (fallocate, openat, close, statx,
fadvise, madvise, openat2, non-vectored read/write, send/recv, and
epoll_ctl)
- Faster ring quiesce for fileset updates
- Optimizations for overflow condition checking
- Support for max-sized clamping
- Support for probing what opcodes are supported
- Support for io-wq backend sharing between "sibling" rings
- Support for registering personalities
- Lots of little fixes and improvements
* tag 'for-5.6/io_uring-vfs-2020-01-29' of git://git.kernel.dk/linux-block: (64 commits)
io_uring: add support for epoll_ctl(2)
eventpoll: support non-blocking do_epoll_ctl() calls
eventpoll: abstract out epoll_ctl() handler
io_uring: fix linked command file table usage
io_uring: support using a registered personality for commands
io_uring: allow registering credentials
io_uring: add io-wq workqueue sharing
io-wq: allow grabbing existing io-wq
io_uring/io-wq: don't use static creds/mm assignments
io-wq: make the io_wq ref counted
io_uring: fix refcounting with batched allocations at OOM
io_uring: add comment for drain_next
io_uring: don't attempt to copy iovec for READ/WRITE
io_uring: honor IOSQE_ASYNC for linked reqs
io_uring: prep req when do IOSQE_ASYNC
io_uring: use labeled array init in io_op_defs
io_uring: optimise sqe-to-req flags translation
io_uring: remove REQ_F_IO_DRAINED
io_uring: file switch work needs to get flushed on exit
io_uring: hide uring_fd in ctx
...
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/eventpoll.h | 9 | ||||
-rw-r--r-- | include/linux/mm.h | 1 | ||||
-rw-r--r-- | include/linux/percpu-refcount.h | 26 | ||||
-rw-r--r-- | include/trace/events/io_uring.h | 13 | ||||
-rw-r--r-- | include/uapi/linux/io_uring.h | 73 |
5 files changed, 107 insertions, 15 deletions
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index bc6d79b00c4e..8f000fada5a4 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -61,6 +61,15 @@ static inline void eventpoll_release(struct file *file) eventpoll_release_file(file); } +int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, + bool nonblock); + +/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ +static inline int ep_op_has_event(int op) +{ + return op != EPOLL_CTL_DEL; +} + #else static inline void eventpoll_init_file(struct file *file) {} diff --git a/include/linux/mm.h b/include/linux/mm.h index 67f8451b9a12..1233bf45164d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2323,6 +2323,7 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf, bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); +extern int do_madvise(unsigned long start, size_t len_in, int behavior); static inline unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 390031e816dc..22d9d183950d 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -210,15 +210,17 @@ static inline void percpu_ref_get(struct percpu_ref *ref) } /** - * percpu_ref_tryget - try to increment a percpu refcount + * percpu_ref_tryget_many - try to increment a percpu refcount * @ref: percpu_ref to try-get + * @nr: number of references to get * - * Increment a percpu refcount unless its count already reached zero. + * Increment a percpu refcount by @nr unless its count already reached zero. * Returns %true on success; %false on failure. * * This function is safe to call as long as @ref is between init and exit. */ -static inline bool percpu_ref_tryget(struct percpu_ref *ref) +static inline bool percpu_ref_tryget_many(struct percpu_ref *ref, + unsigned long nr) { unsigned long __percpu *percpu_count; bool ret; @@ -226,10 +228,10 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) rcu_read_lock(); if (__ref_is_percpu(ref, &percpu_count)) { - this_cpu_inc(*percpu_count); + this_cpu_add(*percpu_count, nr); ret = true; } else { - ret = atomic_long_inc_not_zero(&ref->count); + ret = atomic_long_add_unless(&ref->count, nr, 0); } rcu_read_unlock(); @@ -238,6 +240,20 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) } /** + * percpu_ref_tryget - try to increment a percpu refcount + * @ref: percpu_ref to try-get + * + * Increment a percpu refcount unless its count already reached zero. + * Returns %true on success; %false on failure. + * + * This function is safe to call as long as @ref is between init and exit. + */ +static inline bool percpu_ref_tryget(struct percpu_ref *ref) +{ + return percpu_ref_tryget_many(ref, 1); +} + +/** * percpu_ref_tryget_live - try to increment a live percpu refcount * @ref: percpu_ref to try-get * diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index b352d66b5d51..27bd9e4f927b 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -320,6 +320,7 @@ TRACE_EVENT(io_uring_complete, * io_uring_submit_sqe - called before submitting one SQE * * @ctx: pointer to a ring context structure + * @opcode: opcode of request * @user_data: user data associated with the request * @force_nonblock: whether a context blocking or not * @sq_thread: true if sq_thread has submitted this SQE @@ -329,12 +330,14 @@ TRACE_EVENT(io_uring_complete, */ TRACE_EVENT(io_uring_submit_sqe, - TP_PROTO(void *ctx, u64 user_data, bool force_nonblock, bool sq_thread), + TP_PROTO(void *ctx, u8 opcode, u64 user_data, bool force_nonblock, + bool sq_thread), - TP_ARGS(ctx, user_data, force_nonblock, sq_thread), + TP_ARGS(ctx, opcode, user_data, force_nonblock, sq_thread), TP_STRUCT__entry ( __field( void *, ctx ) + __field( u8, opcode ) __field( u64, user_data ) __field( bool, force_nonblock ) __field( bool, sq_thread ) @@ -342,13 +345,15 @@ TRACE_EVENT(io_uring_submit_sqe, TP_fast_assign( __entry->ctx = ctx; + __entry->opcode = opcode; __entry->user_data = user_data; __entry->force_nonblock = force_nonblock; __entry->sq_thread = sq_thread; ), - TP_printk("ring %p, user data 0x%llx, non block %d, sq_thread %d", - __entry->ctx, (unsigned long long) __entry->user_data, + TP_printk("ring %p, op %d, data 0x%llx, non block %d, sq_thread %d", + __entry->ctx, __entry->opcode, + (unsigned long long) __entry->user_data, __entry->force_nonblock, __entry->sq_thread) ); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 55cfcb71606d..3f7961c1c243 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -34,21 +34,43 @@ struct io_uring_sqe { __u32 timeout_flags; __u32 accept_flags; __u32 cancel_flags; + __u32 open_flags; + __u32 statx_flags; + __u32 fadvise_advice; }; __u64 user_data; /* data to be passed back at completion time */ union { - __u16 buf_index; /* index into fixed buffers, if used */ + struct { + /* index into fixed buffers, if used */ + __u16 buf_index; + /* personality to use, if used */ + __u16 personality; + }; __u64 __pad2[3]; }; }; +enum { + IOSQE_FIXED_FILE_BIT, + IOSQE_IO_DRAIN_BIT, + IOSQE_IO_LINK_BIT, + IOSQE_IO_HARDLINK_BIT, + IOSQE_ASYNC_BIT, +}; + /* * sqe->flags */ -#define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */ -#define IOSQE_IO_DRAIN (1U << 1) /* issue after inflight IO */ -#define IOSQE_IO_LINK (1U << 2) /* links next sqe */ -#define IOSQE_IO_HARDLINK (1U << 3) /* like LINK, but stronger */ +/* use fixed fileset */ +#define IOSQE_FIXED_FILE (1U << IOSQE_FIXED_FILE_BIT) +/* issue after inflight IO */ +#define IOSQE_IO_DRAIN (1U << IOSQE_IO_DRAIN_BIT) +/* links next sqe */ +#define IOSQE_IO_LINK (1U << IOSQE_IO_LINK_BIT) +/* like LINK, but stronger */ +#define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT) +/* always go async */ +#define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT) /* * io_uring_setup() flags @@ -57,6 +79,8 @@ struct io_uring_sqe { #define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */ #define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */ #define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */ +#define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */ +#define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */ enum { IORING_OP_NOP, @@ -76,6 +100,19 @@ enum { IORING_OP_ASYNC_CANCEL, IORING_OP_LINK_TIMEOUT, IORING_OP_CONNECT, + IORING_OP_FALLOCATE, + IORING_OP_OPENAT, + IORING_OP_CLOSE, + IORING_OP_FILES_UPDATE, + IORING_OP_STATX, + IORING_OP_READ, + IORING_OP_WRITE, + IORING_OP_FADVISE, + IORING_OP_MADVISE, + IORING_OP_SEND, + IORING_OP_RECV, + IORING_OP_OPENAT2, + IORING_OP_EPOLL_CTL, /* this goes last, obviously */ IORING_OP_LAST, @@ -153,7 +190,8 @@ struct io_uring_params { __u32 sq_thread_cpu; __u32 sq_thread_idle; __u32 features; - __u32 resv[4]; + __u32 wq_fd; + __u32 resv[3]; struct io_sqring_offsets sq_off; struct io_cqring_offsets cq_off; }; @@ -164,6 +202,8 @@ struct io_uring_params { #define IORING_FEAT_SINGLE_MMAP (1U << 0) #define IORING_FEAT_NODROP (1U << 1) #define IORING_FEAT_SUBMIT_STABLE (1U << 2) +#define IORING_FEAT_RW_CUR_POS (1U << 3) +#define IORING_FEAT_CUR_PERSONALITY (1U << 4) /* * io_uring_register(2) opcodes and arguments @@ -175,6 +215,10 @@ struct io_uring_params { #define IORING_REGISTER_EVENTFD 4 #define IORING_UNREGISTER_EVENTFD 5 #define IORING_REGISTER_FILES_UPDATE 6 +#define IORING_REGISTER_EVENTFD_ASYNC 7 +#define IORING_REGISTER_PROBE 8 +#define IORING_REGISTER_PERSONALITY 9 +#define IORING_UNREGISTER_PERSONALITY 10 struct io_uring_files_update { __u32 offset; @@ -182,4 +226,21 @@ struct io_uring_files_update { __aligned_u64 /* __s32 * */ fds; }; +#define IO_URING_OP_SUPPORTED (1U << 0) + +struct io_uring_probe_op { + __u8 op; + __u8 resv; + __u16 flags; /* IO_URING_OP_* flags */ + __u32 resv2; +}; + +struct io_uring_probe { + __u8 last_op; /* last opcode supported */ + __u8 ops_len; /* length of ops[] array below */ + __u16 resv; + __u32 resv2[3]; + struct io_uring_probe_op ops[0]; +}; + #endif |