summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-01-29 18:53:37 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2020-01-29 18:53:37 -0800
commit896f8d23d0cb5889021d66eab6107e97109c5459 (patch)
treef8ceade3203209679ece27d3cace410178dfc99c /include
parent33c84e89abe4a92ab699c33029bd54269d574782 (diff)
parent3e4827b05d2ac2d377ed136a52829ec46787bf4b (diff)
Merge tag 'for-5.6/io_uring-vfs-2020-01-29' of git://git.kernel.dk/linux-block
Pull io_uring updates from Jens Axboe: - Support for various new opcodes (fallocate, openat, close, statx, fadvise, madvise, openat2, non-vectored read/write, send/recv, and epoll_ctl) - Faster ring quiesce for fileset updates - Optimizations for overflow condition checking - Support for max-sized clamping - Support for probing what opcodes are supported - Support for io-wq backend sharing between "sibling" rings - Support for registering personalities - Lots of little fixes and improvements * tag 'for-5.6/io_uring-vfs-2020-01-29' of git://git.kernel.dk/linux-block: (64 commits) io_uring: add support for epoll_ctl(2) eventpoll: support non-blocking do_epoll_ctl() calls eventpoll: abstract out epoll_ctl() handler io_uring: fix linked command file table usage io_uring: support using a registered personality for commands io_uring: allow registering credentials io_uring: add io-wq workqueue sharing io-wq: allow grabbing existing io-wq io_uring/io-wq: don't use static creds/mm assignments io-wq: make the io_wq ref counted io_uring: fix refcounting with batched allocations at OOM io_uring: add comment for drain_next io_uring: don't attempt to copy iovec for READ/WRITE io_uring: honor IOSQE_ASYNC for linked reqs io_uring: prep req when do IOSQE_ASYNC io_uring: use labeled array init in io_op_defs io_uring: optimise sqe-to-req flags translation io_uring: remove REQ_F_IO_DRAINED io_uring: file switch work needs to get flushed on exit io_uring: hide uring_fd in ctx ...
Diffstat (limited to 'include')
-rw-r--r--include/linux/eventpoll.h9
-rw-r--r--include/linux/mm.h1
-rw-r--r--include/linux/percpu-refcount.h26
-rw-r--r--include/trace/events/io_uring.h13
-rw-r--r--include/uapi/linux/io_uring.h73
5 files changed, 107 insertions, 15 deletions
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index bc6d79b00c4e..8f000fada5a4 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -61,6 +61,15 @@ static inline void eventpoll_release(struct file *file)
eventpoll_release_file(file);
}
+int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
+ bool nonblock);
+
+/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
+static inline int ep_op_has_event(int op)
+{
+ return op != EPOLL_CTL_DEL;
+}
+
#else
static inline void eventpoll_init_file(struct file *file) {}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 67f8451b9a12..1233bf45164d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2323,6 +2323,7 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf, bool downgrade);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf);
+extern int do_madvise(unsigned long start, size_t len_in, int behavior);
static inline unsigned long
do_mmap_pgoff(struct file *file, unsigned long addr,
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 390031e816dc..22d9d183950d 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -210,15 +210,17 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
}
/**
- * percpu_ref_tryget - try to increment a percpu refcount
+ * percpu_ref_tryget_many - try to increment a percpu refcount
* @ref: percpu_ref to try-get
+ * @nr: number of references to get
*
- * Increment a percpu refcount unless its count already reached zero.
+ * Increment a percpu refcount by @nr unless its count already reached zero.
* Returns %true on success; %false on failure.
*
* This function is safe to call as long as @ref is between init and exit.
*/
-static inline bool percpu_ref_tryget(struct percpu_ref *ref)
+static inline bool percpu_ref_tryget_many(struct percpu_ref *ref,
+ unsigned long nr)
{
unsigned long __percpu *percpu_count;
bool ret;
@@ -226,10 +228,10 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
rcu_read_lock();
if (__ref_is_percpu(ref, &percpu_count)) {
- this_cpu_inc(*percpu_count);
+ this_cpu_add(*percpu_count, nr);
ret = true;
} else {
- ret = atomic_long_inc_not_zero(&ref->count);
+ ret = atomic_long_add_unless(&ref->count, nr, 0);
}
rcu_read_unlock();
@@ -238,6 +240,20 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
}
/**
+ * percpu_ref_tryget - try to increment a percpu refcount
+ * @ref: percpu_ref to try-get
+ *
+ * Increment a percpu refcount unless its count already reached zero.
+ * Returns %true on success; %false on failure.
+ *
+ * This function is safe to call as long as @ref is between init and exit.
+ */
+static inline bool percpu_ref_tryget(struct percpu_ref *ref)
+{
+ return percpu_ref_tryget_many(ref, 1);
+}
+
+/**
* percpu_ref_tryget_live - try to increment a live percpu refcount
* @ref: percpu_ref to try-get
*
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index b352d66b5d51..27bd9e4f927b 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -320,6 +320,7 @@ TRACE_EVENT(io_uring_complete,
* io_uring_submit_sqe - called before submitting one SQE
*
* @ctx: pointer to a ring context structure
+ * @opcode: opcode of request
* @user_data: user data associated with the request
* @force_nonblock: whether a context blocking or not
* @sq_thread: true if sq_thread has submitted this SQE
@@ -329,12 +330,14 @@ TRACE_EVENT(io_uring_complete,
*/
TRACE_EVENT(io_uring_submit_sqe,
- TP_PROTO(void *ctx, u64 user_data, bool force_nonblock, bool sq_thread),
+ TP_PROTO(void *ctx, u8 opcode, u64 user_data, bool force_nonblock,
+ bool sq_thread),
- TP_ARGS(ctx, user_data, force_nonblock, sq_thread),
+ TP_ARGS(ctx, opcode, user_data, force_nonblock, sq_thread),
TP_STRUCT__entry (
__field( void *, ctx )
+ __field( u8, opcode )
__field( u64, user_data )
__field( bool, force_nonblock )
__field( bool, sq_thread )
@@ -342,13 +345,15 @@ TRACE_EVENT(io_uring_submit_sqe,
TP_fast_assign(
__entry->ctx = ctx;
+ __entry->opcode = opcode;
__entry->user_data = user_data;
__entry->force_nonblock = force_nonblock;
__entry->sq_thread = sq_thread;
),
- TP_printk("ring %p, user data 0x%llx, non block %d, sq_thread %d",
- __entry->ctx, (unsigned long long) __entry->user_data,
+ TP_printk("ring %p, op %d, data 0x%llx, non block %d, sq_thread %d",
+ __entry->ctx, __entry->opcode,
+ (unsigned long long) __entry->user_data,
__entry->force_nonblock, __entry->sq_thread)
);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 55cfcb71606d..3f7961c1c243 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -34,21 +34,43 @@ struct io_uring_sqe {
__u32 timeout_flags;
__u32 accept_flags;
__u32 cancel_flags;
+ __u32 open_flags;
+ __u32 statx_flags;
+ __u32 fadvise_advice;
};
__u64 user_data; /* data to be passed back at completion time */
union {
- __u16 buf_index; /* index into fixed buffers, if used */
+ struct {
+ /* index into fixed buffers, if used */
+ __u16 buf_index;
+ /* personality to use, if used */
+ __u16 personality;
+ };
__u64 __pad2[3];
};
};
+enum {
+ IOSQE_FIXED_FILE_BIT,
+ IOSQE_IO_DRAIN_BIT,
+ IOSQE_IO_LINK_BIT,
+ IOSQE_IO_HARDLINK_BIT,
+ IOSQE_ASYNC_BIT,
+};
+
/*
* sqe->flags
*/
-#define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */
-#define IOSQE_IO_DRAIN (1U << 1) /* issue after inflight IO */
-#define IOSQE_IO_LINK (1U << 2) /* links next sqe */
-#define IOSQE_IO_HARDLINK (1U << 3) /* like LINK, but stronger */
+/* use fixed fileset */
+#define IOSQE_FIXED_FILE (1U << IOSQE_FIXED_FILE_BIT)
+/* issue after inflight IO */
+#define IOSQE_IO_DRAIN (1U << IOSQE_IO_DRAIN_BIT)
+/* links next sqe */
+#define IOSQE_IO_LINK (1U << IOSQE_IO_LINK_BIT)
+/* like LINK, but stronger */
+#define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT)
+/* always go async */
+#define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT)
/*
* io_uring_setup() flags
@@ -57,6 +79,8 @@ struct io_uring_sqe {
#define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */
#define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */
#define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */
+#define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */
+#define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */
enum {
IORING_OP_NOP,
@@ -76,6 +100,19 @@ enum {
IORING_OP_ASYNC_CANCEL,
IORING_OP_LINK_TIMEOUT,
IORING_OP_CONNECT,
+ IORING_OP_FALLOCATE,
+ IORING_OP_OPENAT,
+ IORING_OP_CLOSE,
+ IORING_OP_FILES_UPDATE,
+ IORING_OP_STATX,
+ IORING_OP_READ,
+ IORING_OP_WRITE,
+ IORING_OP_FADVISE,
+ IORING_OP_MADVISE,
+ IORING_OP_SEND,
+ IORING_OP_RECV,
+ IORING_OP_OPENAT2,
+ IORING_OP_EPOLL_CTL,
/* this goes last, obviously */
IORING_OP_LAST,
@@ -153,7 +190,8 @@ struct io_uring_params {
__u32 sq_thread_cpu;
__u32 sq_thread_idle;
__u32 features;
- __u32 resv[4];
+ __u32 wq_fd;
+ __u32 resv[3];
struct io_sqring_offsets sq_off;
struct io_cqring_offsets cq_off;
};
@@ -164,6 +202,8 @@ struct io_uring_params {
#define IORING_FEAT_SINGLE_MMAP (1U << 0)
#define IORING_FEAT_NODROP (1U << 1)
#define IORING_FEAT_SUBMIT_STABLE (1U << 2)
+#define IORING_FEAT_RW_CUR_POS (1U << 3)
+#define IORING_FEAT_CUR_PERSONALITY (1U << 4)
/*
* io_uring_register(2) opcodes and arguments
@@ -175,6 +215,10 @@ struct io_uring_params {
#define IORING_REGISTER_EVENTFD 4
#define IORING_UNREGISTER_EVENTFD 5
#define IORING_REGISTER_FILES_UPDATE 6
+#define IORING_REGISTER_EVENTFD_ASYNC 7
+#define IORING_REGISTER_PROBE 8
+#define IORING_REGISTER_PERSONALITY 9
+#define IORING_UNREGISTER_PERSONALITY 10
struct io_uring_files_update {
__u32 offset;
@@ -182,4 +226,21 @@ struct io_uring_files_update {
__aligned_u64 /* __s32 * */ fds;
};
+#define IO_URING_OP_SUPPORTED (1U << 0)
+
+struct io_uring_probe_op {
+ __u8 op;
+ __u8 resv;
+ __u16 flags; /* IO_URING_OP_* flags */
+ __u32 resv2;
+};
+
+struct io_uring_probe {
+ __u8 last_op; /* last opcode supported */
+ __u8 ops_len; /* length of ops[] array below */
+ __u16 resv;
+ __u32 resv2[3];
+ struct io_uring_probe_op ops[0];
+};
+
#endif