From 4b70ac5fd9b58bfaa5f25b4ea48f528aefbf3308 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 30 Apr 2014 19:02:48 +0200
Subject: aio: change exit_aio() to load mm->ioctx_table once and avoid rcu_read_lock()

On 04/30, Benjamin LaHaise wrote:
>
> > -		ctx->mmap_size = 0;
> > -
> > -		kill_ioctx(mm, ctx, NULL);
> > +		if (ctx) {
> > +			ctx->mmap_size = 0;
> > +			kill_ioctx(mm, ctx, NULL);
> > +		}
>
> Rather than indenting and moving the two lines changing mmap_size and
> the kill_ioctx() call, why not just do "if (!ctx) ... continue;"? That
> reduces the number of lines changed and avoids excessive indentation.

OK. To me the code looks better/simpler with "if (ctx)", but this is
subjective of course; I won't argue.

The patch still removes the empty line between mmap_size = 0 and
kill_ioctx(); we reset mmap_size only for kill_ioctx(). But feel free to
remove this change.

-------------------------------------------------------------------------------
Subject: [PATCH v3 1/2] aio: change exit_aio() to load mm->ioctx_table once and avoid rcu_read_lock()

1. We can read ->ioctx_table only once, and we do not need rcu_read_lock()
   or even rcu_dereference().

   This mm has no users, so nobody else can play with ->ioctx_table.
   Otherwise the code is buggy anyway: if we needed rcu_read_lock() in a
   loop because ->ioctx_table could be updated, then kfree(table) would
   obviously be wrong.

2. Update the comment. "exit_mmap(mm) is coming" is a good reason to avoid
   munmap(), but another reason is that we simply can't do vm_munmap()
   unless current->mm == mm, and this is not true in general: the caller
   is mmput().

3. We do not really need to nullify mm->ioctx_table before return; the
   current code probably does this to catch potential problems. But in
   this case RCU_INIT_POINTER(NULL) looks better.

Signed-off-by: Oleg Nesterov
Signed-off-by: Benjamin LaHaise
---
 fs/aio.c | 42 ++++++++++++++++--------------------------
 1 file changed, 16 insertions(+), 26 deletions(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 955947ef3e02..b6696462e345 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -791,40 +791,30 @@ EXPORT_SYMBOL(wait_on_sync_kiocb);
  */
 void exit_aio(struct mm_struct *mm)
 {
-	struct kioctx_table *table;
-	struct kioctx *ctx;
-	unsigned i = 0;
-
-	while (1) {
-		rcu_read_lock();
-		table = rcu_dereference(mm->ioctx_table);
-
-		do {
-			if (!table || i >= table->nr) {
-				rcu_read_unlock();
-				rcu_assign_pointer(mm->ioctx_table, NULL);
-				if (table)
-					kfree(table);
-				return;
-			}
+	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
+	int i;
 
-			ctx = table->table[i++];
-		} while (!ctx);
+	if (!table)
+		return;
 
-		rcu_read_unlock();
+	for (i = 0; i < table->nr; ++i) {
+		struct kioctx *ctx = table->table[i];
 
+		if (!ctx)
+			continue;
 		/*
-		 * We don't need to bother with munmap() here -
-		 * exit_mmap(mm) is coming and it'll unmap everything.
-		 * Since aio_free_ring() uses non-zero ->mmap_size
-		 * as indicator that it needs to unmap the area,
-		 * just set it to 0; aio_free_ring() is the only
-		 * place that uses ->mmap_size, so it's safe.
+		 * We don't need to bother with munmap() here - exit_mmap(mm)
+		 * is coming and it'll unmap everything. And we simply can't,
+		 * this is not necessarily our ->mm.
+		 * Since kill_ioctx() uses non-zero ->mmap_size as indicator
+		 * that it needs to unmap the area, just set it to 0.
		 */
 		ctx->mmap_size = 0;
-
 		kill_ioctx(mm, ctx, NULL);
 	}
+
+	RCU_INIT_POINTER(mm->ioctx_table, NULL);
+	kfree(table);
 }
 
 static void put_reqs_available(struct kioctx *ctx, unsigned nr)
--
cgit v1.2.3-58-ga151

From 855ef0dec7271ff7be7381feaaf3f4aed80bd503 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 30 Apr 2014 16:16:36 +0200
Subject: aio: kill the misleading rcu read locks in ioctx_add_table() and kill_ioctx()

ioctx_add_table() is the writer; it does not need rcu_read_lock() to
protect ->ioctx_table. It relies on mm->ioctx_lock, and the rcu locks
just add confusion.

It doesn't need rcu_dereference() for the same reason: it must see any
updates previously done under the same ->ioctx_lock. We could use
rcu_dereference_protected(), but the patch uses rcu_dereference_raw();
the function is simple enough.

The same holds for kill_ioctx(), although it does not update the pointer.

Signed-off-by: Oleg Nesterov
Signed-off-by: Benjamin LaHaise
---
 fs/aio.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index b6696462e345..c1d8c480c138 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -554,8 +554,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 	struct aio_ring *ring;
 
 	spin_lock(&mm->ioctx_lock);
-	rcu_read_lock();
-	table = rcu_dereference(mm->ioctx_table);
+	table = rcu_dereference_raw(mm->ioctx_table);
 
 	while (1) {
 		if (table)
@@ -563,7 +562,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 			if (!table->table[i]) {
 				ctx->id = i;
 				table->table[i] = ctx;
-				rcu_read_unlock();
 				spin_unlock(&mm->ioctx_lock);
 
 				/* While kioctx setup is in progress,
@@ -577,8 +575,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 		}
 
 		new_nr = (table ? table->nr : 1) * 4;
-
-		rcu_read_unlock();
 		spin_unlock(&mm->ioctx_lock);
 
 		table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
@@ -589,8 +585,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 		table->nr = new_nr;
 
 		spin_lock(&mm->ioctx_lock);
-		rcu_read_lock();
-		old = rcu_dereference(mm->ioctx_table);
+		old = rcu_dereference_raw(mm->ioctx_table);
 
 		if (!old) {
 			rcu_assign_pointer(mm->ioctx_table, table);
@@ -737,12 +732,9 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 
 		spin_lock(&mm->ioctx_lock);
-		rcu_read_lock();
-		table = rcu_dereference(mm->ioctx_table);
-
+		table = rcu_dereference_raw(mm->ioctx_table);
 		WARN_ON(ctx != table->table[ctx->id]);
 		table->table[ctx->id] = NULL;
-		rcu_read_unlock();
 		spin_unlock(&mm->ioctx_lock);
 
 		/* percpu_ref_kill() will do the necessary call_rcu() */
--
cgit v1.2.3-58-ga151

From be6fb451a24582732c66e28cb0beb3f19c4289fd Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise
Date: Tue, 22 Jul 2014 09:56:56 -0400
Subject: aio: remove no longer needed preempt_disable()

Based on feedback from Jens Axboe on 263782c1c95bbddbb022dc092fd89a36bb8d5577,
clean up get/put_reqs_available() to remove the no longer needed
preempt_disable() and preempt_enable() pair.

Signed-off-by: Benjamin LaHaise
Cc: Jens Axboe
---
 fs/aio.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 8216aa0c7539..9ce9e8ea773f 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -814,10 +814,8 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr)
 	struct kioctx_cpu *kcpu;
 	unsigned long flags;
 
-	preempt_disable();
-	kcpu = this_cpu_ptr(ctx->cpu);
-
 	local_irq_save(flags);
+	kcpu = this_cpu_ptr(ctx->cpu);
 	kcpu->reqs_available += nr;
 
 	while (kcpu->reqs_available >= ctx->req_batch * 2) {
@@ -826,7 +824,6 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr)
 	}
 
 	local_irq_restore(flags);
-	preempt_enable();
 }
 
 static bool get_reqs_available(struct kioctx *ctx)
@@ -835,10 +832,8 @@ static bool get_reqs_available(struct kioctx *ctx)
 	bool ret = false;
 	unsigned long flags;
 
-	preempt_disable();
-	kcpu = this_cpu_ptr(ctx->cpu);
-
 	local_irq_save(flags);
+	kcpu = this_cpu_ptr(ctx->cpu);
 	if (!kcpu->reqs_available) {
 		int old, avail = atomic_read(&ctx->reqs_available);
 
@@ -858,7 +853,6 @@ static bool get_reqs_available(struct kioctx *ctx)
 	kcpu->reqs_available--;
 out:
 	local_irq_restore(flags);
-	preempt_enable();
 
 	return ret;
 }
--
cgit v1.2.3-58-ga151
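A sketch of the resulting pattern (simplified from the hunks above; not
the exact kernel code): disabling interrupts on the local CPU also
excludes preemption, so a per-CPU pointer taken after local_irq_save()
cannot be invalidated by migration, and the explicit preempt_disable()
pair becomes redundant.

	static void put_reqs_available_sketch(struct kioctx *ctx, unsigned nr)
	{
		struct kioctx_cpu *kcpu;
		unsigned long flags;

		local_irq_save(flags);		/* irqs off: no preemption, no migration */
		kcpu = this_cpu_ptr(ctx->cpu);	/* pointer stays valid until restore */
		kcpu->reqs_available += nr;
		local_irq_restore(flags);
	}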
From b53f1f82fbde8dcf34ab7d731c2a9ae6f0d8d2e2 Mon Sep 17 00:00:00 2001
From: Gu Zheng
Date: Wed, 23 Jul 2014 18:03:51 +0800
Subject: aio: remove the needless registration of ring file's private_data

Remove the registration of the ring file's private_data; we do not use it.

Reviewed-by: Jeff Moyer
Signed-off-by: Gu Zheng
Signed-off-by: Benjamin LaHaise
---
 fs/aio.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 9ce9e8ea773f..55e03a858e8b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -192,7 +192,6 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 	}
 
 	file->f_flags = O_RDWR;
-	file->private_data = ctx;
 	return file;
 }
 
--
cgit v1.2.3-58-ga151

From 8dc4379e17cddad7b2088a3f300ded50d2a6d493 Mon Sep 17 00:00:00 2001
From: Gu Zheng
Date: Wed, 23 Jul 2014 18:03:52 +0800
Subject: aio: use the macro rather than the inline magic number

Replace the inline magic number with the ready-made macro
(AIO_RING_MAGIC); just a cleanup.

Reviewed-by: Jeff Moyer
Signed-off-by: Gu Zheng
Signed-off-by: Benjamin LaHaise
---
 fs/aio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 55e03a858e8b..6fc6b9857a58 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -201,7 +201,7 @@ static struct dentry *aio_mount(struct file_system_type *fs_type,
 	static const struct dentry_operations ops = {
 		.d_dname	= simple_dname,
 	};
-	return mount_pseudo(fs_type, "aio:", NULL, &ops, 0xa10a10a1);
+	return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC);
 }
 
 /* aio_setup
--
cgit v1.2.3-58-ga151

From 2be4e7deec2d4398a0eb2165cc04086ebfc831d2 Mon Sep 17 00:00:00 2001
From: Gu Zheng
Date: Wed, 23 Jul 2014 18:03:53 +0800
Subject: aio: fix some comments

The function comments of aio_run_iocb() and aio_read_events() are out of
date, so fix them here.

Reviewed-by: Jeff Moyer
Signed-off-by: Gu Zheng
Signed-off-by: Benjamin LaHaise
---
 fs/aio.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 6fc6b9857a58..d6d9520f6866 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1020,7 +1020,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 }
 EXPORT_SYMBOL(aio_complete);
 
-/* aio_read_events
+/* aio_read_events_ring
  *	Pull an event off of the ioctx's event ring.  Returns the number of
  *	events fetched
  */
@@ -1272,9 +1272,8 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
 }
 
 /*
- * aio_setup_iocb:
- *	Performs the initial checks and aio retry method
- *	setup for the kiocb at the time of io submission.
+ * aio_run_iocb:
+ *	Performs the initial checks and io submission.
  */
 static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
 			    char __user *buf, bool compat)
--
cgit v1.2.3-58-ga151
From 00fefb9cf2b5493a86912de55ba912bdfae4a207 Mon Sep 17 00:00:00 2001
From: Gu Zheng
Date: Wed, 23 Jul 2014 18:03:54 +0800
Subject: aio: use iovec array rather than the single one

Previously, we offered only a single iovec to handle all the read/write
cases, so PREADV/PWRITEV requests always needed to allocate an extra
iovec buffer when copying user vectors. If we use a tmp iovec array
rather than the single one, small PREADV/PWRITEV workloads (vector size
smaller than the tmp buffer) no longer need to allocate an extra iovec
buffer when copying user vectors.

Reviewed-by: Jeff Moyer
Signed-off-by: Gu Zheng
Signed-off-by: Benjamin LaHaise
---
 fs/aio.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index d6d9520f6866..0fd9181d8c0c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1243,12 +1243,12 @@ static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
 	if (compat)
 		ret = compat_rw_copy_check_uvector(rw,
 				(struct compat_iovec __user *)buf,
-				*nr_segs, 1, *iovec, iovec);
+				*nr_segs, UIO_FASTIOV, *iovec, iovec);
 	else
 #endif
 		ret = rw_copy_check_uvector(rw,
 				(struct iovec __user *)buf,
-				*nr_segs, 1, *iovec, iovec);
+				*nr_segs, UIO_FASTIOV, *iovec, iovec);
 	if (ret < 0)
 		return ret;
 
@@ -1285,7 +1285,7 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
 	fmode_t mode;
 	aio_rw_op *rw_op;
 	rw_iter_op *iter_op;
-	struct iovec inline_vec, *iovec = &inline_vec;
+	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct iov_iter iter;
 
 	switch (opcode) {
@@ -1320,7 +1320,7 @@ rw_common:
 		if (!ret)
 			ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
 		if (ret < 0) {
-			if (iovec != &inline_vec)
+			if (iovec != inline_vecs)
 				kfree(iovec);
 			return ret;
 		}
@@ -1367,7 +1367,7 @@ rw_common:
 		return -EINVAL;
 	}
 
-	if (iovec != &inline_vec)
+	if (iovec != inline_vecs)
 		kfree(iovec);
 
 	if (ret != -EIOCBQUEUED) {
--
cgit v1.2.3-58-ga151
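A sketch of the fast path this enables (hypothetical standalone helper;
rw_copy_check_uvector() and UIO_FASTIOV are real kernel symbols, the
wrapper itself is not in the patch): vectors with up to UIO_FASTIOV
segments land in the caller's on-stack array, and only larger requests
fall back to a kmalloc()ed copy.

	static ssize_t copy_user_iovec_sketch(int rw, const struct iovec __user *buf,
					      unsigned long nr_segs)
	{
		struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
		ssize_t ret;

		/* May reassign iovec to a kmalloc()ed array if nr_segs > UIO_FASTIOV. */
		ret = rw_copy_check_uvector(rw, buf, nr_segs, UIO_FASTIOV,
					    inline_vecs, &iovec);

		/* ... use iovec[0 .. nr_segs-1] ... */

		if (iovec != inline_vecs)	/* free only what was allocated */
			kfree(iovec);
		return ret;
	}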
From d856f32a86b2b015ab180ab7a55e455ed8d3ccc5 Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise
Date: Sun, 24 Aug 2014 13:14:05 -0400
Subject: aio: fix reqs_available handling

As reported by Dan Aloni, commit f8567a3845ac ("aio: fix aio request leak
when events are reaped by userspace") introduces a regression when user
code attempts to perform io_submit() with more events than are available
in the ring buffer. Reverting that commit would reintroduce a regression
when user space event reaping is used.

Fixing this bug is a bit more involved than the previous attempts to fix
this regression. Since we do not have a single point at which we can
count events as being reaped by user space and io_getevents(), we have to
track event completion by looking at the number of events left in the
event ring. So long as there are as many events in the ring buffer as
there have been completion events generated, we cannot call
put_reqs_available(). The code to check for this is now placed in
refill_reqs_available().

A test program from Dan and modified by me for verifying this bug is
available at http://www.kvack.org/~bcrl/20140824-aio_bug.c .

Reported-by: Dan Aloni
Signed-off-by: Benjamin LaHaise
Acked-by: Dan Aloni
Cc: Kent Overstreet
Cc: Mateusz Guzik
Cc: Petr Matousek
Cc: stable@vger.kernel.org	# v3.16 and anything that f8567a3845ac was backported to
Signed-off-by: Linus Torvalds
---
 fs/aio.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 73 insertions(+), 4 deletions(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index ae635872affb..97bc62cbe2da 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -141,6 +141,7 @@ struct kioctx {
 
 	struct {
 		unsigned	tail;
+		unsigned	completed_events;
 		spinlock_t	completion_lock;
 	} ____cacheline_aligned_in_smp;
 
@@ -857,6 +858,68 @@ out:
 	return ret;
 }
 
+/* refill_reqs_available
+ *	Updates the reqs_available reference counts used for tracking the
+ *	number of free slots in the completion ring.  This can be called
+ *	from aio_complete() (to optimistically update reqs_available) or
+ *	from aio_get_req() (the we're out of events case).  It must be
+ *	called holding ctx->completion_lock.
+ */
+static void refill_reqs_available(struct kioctx *ctx, unsigned head,
+				  unsigned tail)
+{
+	unsigned events_in_ring, completed;
+
+	/* Clamp head since userland can write to it. */
+	head %= ctx->nr_events;
+	if (head <= tail)
+		events_in_ring = tail - head;
+	else
+		events_in_ring = ctx->nr_events - (head - tail);
+
+	completed = ctx->completed_events;
+	if (events_in_ring < completed)
+		completed -= events_in_ring;
+	else
+		completed = 0;
+
+	if (!completed)
+		return;
+
+	ctx->completed_events -= completed;
+	put_reqs_available(ctx, completed);
+}
+
+/* user_refill_reqs_available
+ *	Called to refill reqs_available when aio_get_req() encounters an
+ *	out of space in the completion ring.
+ */
+static void user_refill_reqs_available(struct kioctx *ctx)
+{
+	spin_lock_irq(&ctx->completion_lock);
+	if (ctx->completed_events) {
+		struct aio_ring *ring;
+		unsigned head;
+
+		/* Access of ring->head may race with aio_read_events_ring()
+		 * here, but that's okay since whether we read the old version
+		 * or the new version, and either will be valid.  The important
+		 * part is that head cannot pass tail since we prevent
+		 * aio_complete() from updating tail by holding
+		 * ctx->completion_lock.  Even if head is invalid, the check
+		 * against ctx->completed_events below will make sure we do the
+		 * safe/right thing.
+		 */
+		ring = kmap_atomic(ctx->ring_pages[0]);
+		head = ring->head;
+		kunmap_atomic(ring);
+
+		refill_reqs_available(ctx, head, ctx->tail);
+	}
+
+	spin_unlock_irq(&ctx->completion_lock);
+}
+
 /* aio_get_req
  *	Allocate a slot for an aio request.
  *	Returns NULL if no requests are free.
@@ -865,8 +928,11 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx)
 {
 	struct kiocb *req;
 
-	if (!get_reqs_available(ctx))
-		return NULL;
+	if (!get_reqs_available(ctx)) {
+		user_refill_reqs_available(ctx);
+		if (!get_reqs_available(ctx))
+			return NULL;
+	}
 
 	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
 	if (unlikely(!req))
@@ -925,8 +991,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	struct kioctx	*ctx = iocb->ki_ctx;
 	struct aio_ring	*ring;
 	struct io_event	*ev_page, *event;
+	unsigned tail, pos, head;
 	unsigned long	flags;
-	unsigned tail, pos;
 
 	/*
 	 * Special case handling for sync iocbs:
@@ -987,10 +1053,14 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	ctx->tail = tail;
 
 	ring = kmap_atomic(ctx->ring_pages[0]);
+	head = ring->head;
 	ring->tail = tail;
 	kunmap_atomic(ring);
 	flush_dcache_page(ctx->ring_pages[0]);
 
+	ctx->completed_events++;
+	if (ctx->completed_events > 1)
+		refill_reqs_available(ctx, head, tail);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 	pr_debug("added to ring %p at [%u]\n", iocb, tail);
@@ -1005,7 +1075,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 
 	/* everything turned out well, dispose of the aiocb. */
 	kiocb_free(iocb);
-	put_reqs_available(ctx, 1);
 
 	/*
 	 * We have to order our ring_info tail store above and test
--
cgit v1.2.3-58-ga151
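To make the accounting concrete, here is a standalone sketch of the
arithmetic refill_reqs_available() performs (hypothetical helper, not the
kernel code), with a worked example in the comment:

	/*
	 * Example: nr_events = 8, head = 6, tail = 2 gives
	 * events_in_ring = 8 - (6 - 2) = 4; with completed_events = 6,
	 * two slots (6 - 4) can be handed back to reqs_available.
	 */
	static unsigned reclaimable_slots(unsigned head, unsigned tail,
					  unsigned nr_events,
					  unsigned completed_events)
	{
		unsigned events_in_ring;

		head %= nr_events;	/* clamp: userland can write head */
		if (head <= tail)
			events_in_ring = tail - head;
		else
			events_in_ring = nr_events - (head - tail);

		/* Only events already reaped from the ring free up slots. */
		return completed_events > events_in_ring ?
		       completed_events - events_in_ring : 0;
	}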
From 2ff396be602f10b5eab8e73b24f20348fa2de159 Mon Sep 17 00:00:00 2001
From: Jeff Moyer
Date: Tue, 2 Sep 2014 13:17:00 -0400
Subject: aio: add missing smp_rmb() in read_events_ring

We ran into a case on ppc64 running mariadb where io_getevents would
return zeroed out I/O events. After adding instrumentation, it became
clear that there was some missing synchronization between reading the
tail pointer and the events themselves. This small patch fixes the
problem in testing.

Thanks to Zach for helping to look into this, and suggesting the fix.

Signed-off-by: Jeff Moyer
Signed-off-by: Benjamin LaHaise
Cc: stable@vger.kernel.org
---
 fs/aio.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 97bc62cbe2da..5f2e9c6c328e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1111,6 +1111,12 @@ static long aio_read_events_ring(struct kioctx *ctx,
 	tail = ring->tail;
 	kunmap_atomic(ring);
 
+	/*
+	 * Ensure that once we've read the current tail pointer, that
+	 * we also see the events that were stored up to the tail.
+	 */
+	smp_rmb();
+
 	pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
 
 	if (head == tail)
--
cgit v1.2.3-58-ga151
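The new barrier pairs with the ordering already present on the
completion side, where aio_complete() issues an smp_wmb() before
publishing the new tail. A sketch of the two sides (simplified; not the
exact kernel code, and copy_events() is a hypothetical stand-in for the
reader's copy loop):

	/* Writer side, aio_complete(): */
	*event = payload;	/* store the io_event data into the ring page */
	smp_wmb();		/* make event visible before updating tail */
	ring->tail = tail;	/* publish the new tail */

	/* Reader side, aio_read_events_ring(), with this patch: */
	tail = ring->tail;	/* observe the published tail */
	smp_rmb();		/* pairs with the writer's smp_wmb() */
	copy_events(head, tail);	/* events up to tail are now visible */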
From 6098b45b32e6baeacc04790773ced9340601d511 Mon Sep 17 00:00:00 2001
From: Gu Zheng
Date: Wed, 3 Sep 2014 17:45:44 +0800
Subject: aio: block exit_aio() until all context requests are completed

It seems that exit_aio() also needs to wait for all iocbs to complete
(like io_destroy), but we missed the wait step in the current
implementation, so fix it in the same way as we did in io_destroy.

Signed-off-by: Gu Zheng
Signed-off-by: Benjamin LaHaise
Cc: stable@vger.kernel.org
---
 fs/aio.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 5f2e9c6c328e..733750096b71 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -793,6 +793,8 @@ void exit_aio(struct mm_struct *mm)
 
 	for (i = 0; i < table->nr; ++i) {
 		struct kioctx *ctx = table->table[i];
+		struct completion requests_done =
+			COMPLETION_INITIALIZER_ONSTACK(requests_done);
 
 		if (!ctx)
 			continue;
@@ -804,7 +806,10 @@ void exit_aio(struct mm_struct *mm)
 		 * that it needs to unmap the area, just set it to 0.
 		 */
 		ctx->mmap_size = 0;
-		kill_ioctx(mm, ctx, NULL);
+		kill_ioctx(mm, ctx, &requests_done);
+
+		/* Wait until all IO for the context are done. */
+		wait_for_completion(&requests_done);
 	}
 
 	RCU_INIT_POINTER(mm->ioctx_table, NULL);
--
cgit v1.2.3-58-ga151
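Reduced to its essentials, the synchronization added above is the
standard on-stack completion idiom (sketch, not the exact kernel code;
start_async_teardown() is a hypothetical stand-in for kill_ioctx()):

	DECLARE_COMPLETION_ONSTACK(requests_done);	/* expands to the same initializer */

	start_async_teardown(ctx, &requests_done);	/* completes when the last request finishes */
	wait_for_completion(&requests_done);		/* the frame must not unwind before this returns */

The waiter owns the completion on its stack, the worker signals it
exactly once, and the waiting function may not return (and so pop the
stack frame) until wait_for_completion() comes back - which is exactly
why exit_aio() must block here.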