From 4b70ac5fd9b58bfaa5f25b4ea48f528aefbf3308 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 30 Apr 2014 19:02:48 +0200
Subject: aio: change exit_aio() to load mm->ioctx_table once and avoid rcu_read_lock()

On 04/30, Benjamin LaHaise wrote:
>
> > -		ctx->mmap_size = 0;
> > -
> > -		kill_ioctx(mm, ctx, NULL);
> > +		if (ctx) {
> > +			ctx->mmap_size = 0;
> > +			kill_ioctx(mm, ctx, NULL);
> > +		}
>
> Rather than indenting and moving the two lines changing mmap_size and
> the kill_ioctx() call, why not just do "if (!ctx) ... continue;"? That
> reduces the number of lines changed and avoids excessive indentation.

OK. To me the code looks better/simpler with "if (ctx)", but this is
subjective of course; I won't argue.

The patch still removes the empty line between mmap_size = 0 and
kill_ioctx(); we reset mmap_size only for kill_ioctx(). But feel free to
remove this change.

-------------------------------------------------------------------------------
Subject: [PATCH v3 1/2] aio: change exit_aio() to load mm->ioctx_table once and avoid rcu_read_lock()

1. We can read ->ioctx_table only once, and we do not need rcu_read_lock()
   or even rcu_dereference().

   This mm has no users, so nobody else can play with ->ioctx_table.
   Otherwise the code is buggy anyway: if we needed rcu_read_lock() in a
   loop because ->ioctx_table could be updated, then kfree(table) would
   obviously be wrong.

2. Update the comment. "exit_mmap(mm) is coming" is a good reason to avoid
   munmap(), but another reason is that we simply can't do vm_munmap()
   unless current->mm == mm, and this is not true in general: the caller
   is mmput().

3. We do not really need to nullify mm->ioctx_table before return; the
   current code probably does this to catch potential problems. But in
   this case RCU_INIT_POINTER(NULL) looks better.

Signed-off-by: Oleg Nesterov
Signed-off-by: Benjamin LaHaise
---
 fs/aio.c | 42 ++++++++++++++++--------------------------
 1 file changed, 16 insertions(+), 26 deletions(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 955947ef3e02..b6696462e345 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -791,40 +791,30 @@ EXPORT_SYMBOL(wait_on_sync_kiocb);
  */
 void exit_aio(struct mm_struct *mm)
 {
-	struct kioctx_table *table;
-	struct kioctx *ctx;
-	unsigned i = 0;
-
-	while (1) {
-		rcu_read_lock();
-		table = rcu_dereference(mm->ioctx_table);
-
-		do {
-			if (!table || i >= table->nr) {
-				rcu_read_unlock();
-				rcu_assign_pointer(mm->ioctx_table, NULL);
-				if (table)
-					kfree(table);
-				return;
-			}
+	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
+	int i;
 
-			ctx = table->table[i++];
-		} while (!ctx);
+	if (!table)
+		return;
 
-		rcu_read_unlock();
+	for (i = 0; i < table->nr; ++i) {
+		struct kioctx *ctx = table->table[i];
 
+		if (!ctx)
+			continue;
 		/*
-		 * We don't need to bother with munmap() here -
-		 * exit_mmap(mm) is coming and it'll unmap everything.
-		 * Since aio_free_ring() uses non-zero ->mmap_size
-		 * as indicator that it needs to unmap the area,
-		 * just set it to 0; aio_free_ring() is the only
-		 * place that uses ->mmap_size, so it's safe.
+		 * We don't need to bother with munmap() here - exit_mmap(mm)
+		 * is coming and it'll unmap everything. And we simply can't,
+		 * this is not necessarily our ->mm.
+		 * Since kill_ioctx() uses non-zero ->mmap_size as indicator
+		 * that it needs to unmap the area, just set it to 0.
		 */
 		ctx->mmap_size = 0;
-
 		kill_ioctx(mm, ctx, NULL);
 	}
+
+	RCU_INIT_POINTER(mm->ioctx_table, NULL);
+	kfree(table);
 }
 
 static void put_reqs_available(struct kioctx *ctx, unsigned nr)
--
cgit v1.2.3-58-ga151

From 855ef0dec7271ff7be7381feaaf3f4aed80bd503 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 30 Apr 2014 16:16:36 +0200
Subject: aio: kill the misleading rcu read locks in ioctx_add_table() and kill_ioctx()

ioctx_add_table() is the writer; it does not need rcu_read_lock() to
protect ->ioctx_table. It relies on mm->ioctx_lock, and the rcu locks
just add confusion.

It doesn't need rcu_dereference() for the same reason: it must see any
updates previously done under the same ->ioctx_lock. We could use
rcu_dereference_protected(), but the patch uses rcu_dereference_raw();
the function is simple enough.

The same holds for kill_ioctx(), although it does not update the pointer.

Signed-off-by: Oleg Nesterov
Signed-off-by: Benjamin LaHaise
---
 fs/aio.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index b6696462e345..c1d8c480c138 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -554,8 +554,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 	struct aio_ring *ring;
 
 	spin_lock(&mm->ioctx_lock);
-	rcu_read_lock();
-	table = rcu_dereference(mm->ioctx_table);
+	table = rcu_dereference_raw(mm->ioctx_table);
 
 	while (1) {
 		if (table)
@@ -563,7 +562,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 			if (!table->table[i]) {
 				ctx->id = i;
 				table->table[i] = ctx;
-				rcu_read_unlock();
 				spin_unlock(&mm->ioctx_lock);
 
 				/* While kioctx setup is in progress,
@@ -577,8 +575,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 		}
 
 		new_nr = (table ? table->nr : 1) * 4;
-
-		rcu_read_unlock();
 		spin_unlock(&mm->ioctx_lock);
 
 		table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
@@ -589,8 +585,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 		table->nr = new_nr;
 
 		spin_lock(&mm->ioctx_lock);
-		rcu_read_lock();
-		old = rcu_dereference(mm->ioctx_table);
+		old = rcu_dereference_raw(mm->ioctx_table);
 
 		if (!old) {
 			rcu_assign_pointer(mm->ioctx_table, table);
@@ -737,12 +732,9 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 
 		spin_lock(&mm->ioctx_lock);
-		rcu_read_lock();
-		table = rcu_dereference(mm->ioctx_table);
-
+		table = rcu_dereference_raw(mm->ioctx_table);
 		WARN_ON(ctx != table->table[ctx->id]);
 		table->table[ctx->id] = NULL;
-		rcu_read_unlock();
 		spin_unlock(&mm->ioctx_lock);
 
 		/* percpu_ref_kill() will do the necessary call_rcu() */
--
cgit v1.2.3-58-ga151

From be6fb451a24582732c66e28cb0beb3f19c4289fd Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise
Date: Tue, 22 Jul 2014 09:56:56 -0400
Subject: aio: remove no longer needed preempt_disable()

Based on feedback from Jens Axboe on 263782c1c95bbddbb022dc092fd89a36bb8d5577,
clean up get/put_reqs_available() to remove the no longer needed
preempt_disable() and preempt_enable() pair.

Signed-off-by: Benjamin LaHaise
Cc: Jens Axboe
---
 fs/aio.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 8216aa0c7539..9ce9e8ea773f 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -814,10 +814,8 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr)
 	struct kioctx_cpu *kcpu;
 	unsigned long flags;
 
-	preempt_disable();
-	kcpu = this_cpu_ptr(ctx->cpu);
-
 	local_irq_save(flags);
+	kcpu = this_cpu_ptr(ctx->cpu);
 	kcpu->reqs_available += nr;
 
 	while (kcpu->reqs_available >= ctx->req_batch * 2) {
@@ -826,7 +824,6 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr)
 	}
 
 	local_irq_restore(flags);
-	preempt_enable();
 }
 
 static bool get_reqs_available(struct kioctx *ctx)
@@ -835,10 +832,8 @@ static bool get_reqs_available(struct kioctx *ctx)
 	bool ret = false;
 	unsigned long flags;
 
-	preempt_disable();
-	kcpu = this_cpu_ptr(ctx->cpu);
-
 	local_irq_save(flags);
+	kcpu = this_cpu_ptr(ctx->cpu);
 	if (!kcpu->reqs_available) {
 		int old, avail = atomic_read(&ctx->reqs_available);
 
@@ -858,7 +853,6 @@ static bool get_reqs_available(struct kioctx *ctx)
 	kcpu->reqs_available--;
 out:
 	local_irq_restore(flags);
-	preempt_enable();
 
 	return ret;
 }
--
cgit v1.2.3-58-ga151
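A sketch of the resulting pattern (simplified from the hunks above; not
the exact kernel code): disabling interrupts on the local CPU also
excludes preemption, so a per-CPU pointer taken after local_irq_save()
cannot be invalidated by migration, and the explicit preempt_disable()
pair becomes redundant.

	static void put_reqs_available_sketch(struct kioctx *ctx, unsigned nr)
	{
		struct kioctx_cpu *kcpu;
		unsigned long flags;

		local_irq_save(flags);		/* irqs off: no preemption, no migration */
		kcpu = this_cpu_ptr(ctx->cpu);	/* pointer stays valid until restore */
		kcpu->reqs_available += nr;
		local_irq_restore(flags);
	}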
From b53f1f82fbde8dcf34ab7d731c2a9ae6f0d8d2e2 Mon Sep 17 00:00:00 2001
From: Gu Zheng
Date: Wed, 23 Jul 2014 18:03:51 +0800
Subject: aio: remove the needless registration of ring file's private_data

Remove the registration of the ring file's private_data; we do not use it.

Reviewed-by: Jeff Moyer
Signed-off-by: Gu Zheng
Signed-off-by: Benjamin LaHaise
---
 fs/aio.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 9ce9e8ea773f..55e03a858e8b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -192,7 +192,6 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 	}
 
 	file->f_flags = O_RDWR;
-	file->private_data = ctx;
 	return file;
 }
 
--
cgit v1.2.3-58-ga151

From 8dc4379e17cddad7b2088a3f300ded50d2a6d493 Mon Sep 17 00:00:00 2001
From: Gu Zheng
Date: Wed, 23 Jul 2014 18:03:52 +0800
Subject: aio: use the macro rather than the inline magic number

Replace the inline magic number with the ready-made macro
(AIO_RING_MAGIC); just a cleanup.

Reviewed-by: Jeff Moyer
Signed-off-by: Gu Zheng
Signed-off-by: Benjamin LaHaise
---
 fs/aio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 55e03a858e8b..6fc6b9857a58 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -201,7 +201,7 @@ static struct dentry *aio_mount(struct file_system_type *fs_type,
 	static const struct dentry_operations ops = {
 		.d_dname	= simple_dname,
 	};
-	return mount_pseudo(fs_type, "aio:", NULL, &ops, 0xa10a10a1);
+	return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC);
 }
 
 /* aio_setup
--
cgit v1.2.3-58-ga151

From 2be4e7deec2d4398a0eb2165cc04086ebfc831d2 Mon Sep 17 00:00:00 2001
From: Gu Zheng
Date: Wed, 23 Jul 2014 18:03:53 +0800
Subject: aio: fix some comments

The function comments of aio_run_iocb() and aio_read_events() are out of
date, so fix them here.

Reviewed-by: Jeff Moyer
Signed-off-by: Gu Zheng
Signed-off-by: Benjamin LaHaise
---
 fs/aio.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 6fc6b9857a58..d6d9520f6866 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1020,7 +1020,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 }
 EXPORT_SYMBOL(aio_complete);
 
-/* aio_read_events
+/* aio_read_events_ring
  *	Pull an event off of the ioctx's event ring.  Returns the number of
  *	events fetched
  */
@@ -1272,9 +1272,8 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
 }
 
 /*
- * aio_setup_iocb:
- *	Performs the initial checks and aio retry method
- *	setup for the kiocb at the time of io submission.
+ * aio_run_iocb:
+ *	Performs the initial checks and io submission.
  */
 static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
 			    char __user *buf, bool compat)
--
cgit v1.2.3-58-ga151
From 00fefb9cf2b5493a86912de55ba912bdfae4a207 Mon Sep 17 00:00:00 2001
From: Gu Zheng
Date: Wed, 23 Jul 2014 18:03:54 +0800
Subject: aio: use iovec array rather than the single one

Previously, we offered only a single iovec to handle all the read/write
cases, so PREADV/PWRITEV requests always needed to allocate an extra
iovec buffer when copying user vectors. If we use a tmp iovec array
rather than the single one, small PREADV/PWRITEV workloads (vector size
smaller than the tmp buffer) no longer need to allocate an extra iovec
buffer when copying user vectors.

Reviewed-by: Jeff Moyer
Signed-off-by: Gu Zheng
Signed-off-by: Benjamin LaHaise
---
 fs/aio.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index d6d9520f6866..0fd9181d8c0c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1243,12 +1243,12 @@ static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
 	if (compat)
 		ret = compat_rw_copy_check_uvector(rw,
 				(struct compat_iovec __user *)buf,
-				*nr_segs, 1, *iovec, iovec);
+				*nr_segs, UIO_FASTIOV, *iovec, iovec);
 	else
 #endif
 		ret = rw_copy_check_uvector(rw,
 				(struct iovec __user *)buf,
-				*nr_segs, 1, *iovec, iovec);
+				*nr_segs, UIO_FASTIOV, *iovec, iovec);
 	if (ret < 0)
 		return ret;
 
@@ -1285,7 +1285,7 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
 	fmode_t mode;
 	aio_rw_op *rw_op;
 	rw_iter_op *iter_op;
-	struct iovec inline_vec, *iovec = &inline_vec;
+	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct iov_iter iter;
 
 	switch (opcode) {
@@ -1320,7 +1320,7 @@ rw_common:
 		if (!ret)
 			ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
 		if (ret < 0) {
-			if (iovec != &inline_vec)
+			if (iovec != inline_vecs)
 				kfree(iovec);
 			return ret;
 		}
@@ -1367,7 +1367,7 @@ rw_common:
 		return -EINVAL;
 	}
 
-	if (iovec != &inline_vec)
+	if (iovec != inline_vecs)
 		kfree(iovec);
 
 	if (ret != -EIOCBQUEUED) {
--
cgit v1.2.3-58-ga151
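A sketch of the fast path this enables (hypothetical standalone helper;
rw_copy_check_uvector() and UIO_FASTIOV are real kernel symbols, the
wrapper itself is not in the patch): vectors with up to UIO_FASTIOV
segments land in the caller's on-stack array, and only larger requests
fall back to a kmalloc()ed copy.

	static ssize_t copy_user_iovec_sketch(int rw, const struct iovec __user *buf,
					      unsigned long nr_segs)
	{
		struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
		ssize_t ret;

		/* May reassign iovec to a kmalloc()ed array if nr_segs > UIO_FASTIOV. */
		ret = rw_copy_check_uvector(rw, buf, nr_segs, UIO_FASTIOV,
					    inline_vecs, &iovec);

		/* ... use iovec[0 .. nr_segs-1] ... */

		if (iovec != inline_vecs)	/* free only what was allocated */
			kfree(iovec);
		return ret;
	}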
From d856f32a86b2b015ab180ab7a55e455ed8d3ccc5 Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise
Date: Sun, 24 Aug 2014 13:14:05 -0400
Subject: aio: fix reqs_available handling

As reported by Dan Aloni, commit f8567a3845ac ("aio: fix aio request leak
when events are reaped by userspace") introduces a regression when user
code attempts to perform io_submit() with more events than are available
in the ring buffer. Reverting that commit would reintroduce a regression
when user space event reaping is used.

Fixing this bug is a bit more involved than the previous attempts to fix
this regression. Since we do not have a single point at which we can
count events as being reaped by user space and io_getevents(), we have to
track event completion by looking at the number of events left in the
event ring. So long as there are as many events in the ring buffer as
there have been completion events generated, we cannot call
put_reqs_available(). The code to check for this is now placed in
refill_reqs_available().

A test program from Dan and modified by me for verifying this bug is
available at http://www.kvack.org/~bcrl/20140824-aio_bug.c .

Reported-by: Dan Aloni
Signed-off-by: Benjamin LaHaise
Acked-by: Dan Aloni
Cc: Kent Overstreet
Cc: Mateusz Guzik
Cc: Petr Matousek
Cc: stable@vger.kernel.org	# v3.16 and anything that f8567a3845ac was backported to
Signed-off-by: Linus Torvalds
---
 fs/aio.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 73 insertions(+), 4 deletions(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index ae635872affb..97bc62cbe2da 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -141,6 +141,7 @@ struct kioctx {
 
 	struct {
 		unsigned	tail;
+		unsigned	completed_events;
 		spinlock_t	completion_lock;
 	} ____cacheline_aligned_in_smp;
 
@@ -857,6 +858,68 @@ out:
 	return ret;
 }
 
+/* refill_reqs_available
+ *	Updates the reqs_available reference counts used for tracking the
+ *	number of free slots in the completion ring.  This can be called
+ *	from aio_complete() (to optimistically update reqs_available) or
+ *	from aio_get_req() (the we're out of events case).  It must be
+ *	called holding ctx->completion_lock.
+ */
+static void refill_reqs_available(struct kioctx *ctx, unsigned head,
+				  unsigned tail)
+{
+	unsigned events_in_ring, completed;
+
+	/* Clamp head since userland can write to it. */
+	head %= ctx->nr_events;
+	if (head <= tail)
+		events_in_ring = tail - head;
+	else
+		events_in_ring = ctx->nr_events - (head - tail);
+
+	completed = ctx->completed_events;
+	if (events_in_ring < completed)
+		completed -= events_in_ring;
+	else
+		completed = 0;
+
+	if (!completed)
+		return;
+
+	ctx->completed_events -= completed;
+	put_reqs_available(ctx, completed);
+}
+
+/* user_refill_reqs_available
+ *	Called to refill reqs_available when aio_get_req() encounters an
+ *	out of space in the completion ring.
+ */
+static void user_refill_reqs_available(struct kioctx *ctx)
+{
+	spin_lock_irq(&ctx->completion_lock);
+	if (ctx->completed_events) {
+		struct aio_ring *ring;
+		unsigned head;
+
+		/* Access of ring->head may race with aio_read_events_ring()
+		 * here, but that's okay since whether we read the old version
+		 * or the new version, and either will be valid.  The important
+		 * part is that head cannot pass tail since we prevent
+		 * aio_complete() from updating tail by holding
+		 * ctx->completion_lock.  Even if head is invalid, the check
+		 * against ctx->completed_events below will make sure we do the
+		 * safe/right thing.
+		 */
+		ring = kmap_atomic(ctx->ring_pages[0]);
+		head = ring->head;
+		kunmap_atomic(ring);
+
+		refill_reqs_available(ctx, head, ctx->tail);
+	}
+
+	spin_unlock_irq(&ctx->completion_lock);
+}
+
 /* aio_get_req
  *	Allocate a slot for an aio request.
  *	Returns NULL if no requests are free.
@@ -865,8 +928,11 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx)
 {
 	struct kiocb *req;
 
-	if (!get_reqs_available(ctx))
-		return NULL;
+	if (!get_reqs_available(ctx)) {
+		user_refill_reqs_available(ctx);
+		if (!get_reqs_available(ctx))
+			return NULL;
+	}
 
 	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
 	if (unlikely(!req))
@@ -925,8 +991,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	struct kioctx	*ctx = iocb->ki_ctx;
 	struct aio_ring	*ring;
 	struct io_event	*ev_page, *event;
+	unsigned tail, pos, head;
 	unsigned long	flags;
-	unsigned tail, pos;
 
 	/*
 	 * Special case handling for sync iocbs:
@@ -987,10 +1053,14 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	ctx->tail = tail;
 
 	ring = kmap_atomic(ctx->ring_pages[0]);
+	head = ring->head;
 	ring->tail = tail;
 	kunmap_atomic(ring);
 	flush_dcache_page(ctx->ring_pages[0]);
 
+	ctx->completed_events++;
+	if (ctx->completed_events > 1)
+		refill_reqs_available(ctx, head, tail);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 	pr_debug("added to ring %p at [%u]\n", iocb, tail);
@@ -1005,7 +1075,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 
 	/* everything turned out well, dispose of the aiocb. */
 	kiocb_free(iocb);
-	put_reqs_available(ctx, 1);
 
 	/*
 	 * We have to order our ring_info tail store above and test
--
cgit v1.2.3-58-ga151
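To make the accounting concrete, here is a standalone sketch of the
arithmetic refill_reqs_available() performs (hypothetical helper, not the
kernel code), with a worked example in the comment:

	/*
	 * Example: nr_events = 8, head = 6, tail = 2 gives
	 * events_in_ring = 8 - (6 - 2) = 4; with completed_events = 6,
	 * two slots (6 - 4) can be handed back to reqs_available.
	 */
	static unsigned reclaimable_slots(unsigned head, unsigned tail,
					  unsigned nr_events,
					  unsigned completed_events)
	{
		unsigned events_in_ring;

		head %= nr_events;	/* clamp: userland can write head */
		if (head <= tail)
			events_in_ring = tail - head;
		else
			events_in_ring = nr_events - (head - tail);

		/* Only events already reaped from the ring free up slots. */
		return completed_events > events_in_ring ?
		       completed_events - events_in_ring : 0;
	}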
From 2ff396be602f10b5eab8e73b24f20348fa2de159 Mon Sep 17 00:00:00 2001
From: Jeff Moyer
Date: Tue, 2 Sep 2014 13:17:00 -0400
Subject: aio: add missing smp_rmb() in read_events_ring

We ran into a case on ppc64 running mariadb where io_getevents would
return zeroed out I/O events. After adding instrumentation, it became
clear that there was some missing synchronization between reading the
tail pointer and the events themselves. This small patch fixes the
problem in testing.

Thanks to Zach for helping to look into this, and suggesting the fix.

Signed-off-by: Jeff Moyer
Signed-off-by: Benjamin LaHaise
Cc: stable@vger.kernel.org
---
 fs/aio.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 97bc62cbe2da..5f2e9c6c328e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1111,6 +1111,12 @@ static long aio_read_events_ring(struct kioctx *ctx,
 	tail = ring->tail;
 	kunmap_atomic(ring);
 
+	/*
+	 * Ensure that once we've read the current tail pointer, that
+	 * we also see the events that were stored up to the tail.
+	 */
+	smp_rmb();
+
 	pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
 
 	if (head == tail)
--
cgit v1.2.3-58-ga151
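The new barrier pairs with the ordering already present on the
completion side, where aio_complete() issues an smp_wmb() before
publishing the new tail. A sketch of the two sides (simplified; not the
exact kernel code, and copy_events() is a hypothetical stand-in for the
reader's copy loop):

	/* Writer side, aio_complete(): */
	*event = payload;	/* store the io_event data into the ring page */
	smp_wmb();		/* make event visible before updating tail */
	ring->tail = tail;	/* publish the new tail */

	/* Reader side, aio_read_events_ring(), with this patch: */
	tail = ring->tail;	/* observe the published tail */
	smp_rmb();		/* pairs with the writer's smp_wmb() */
	copy_events(head, tail);	/* events up to tail are now visible */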
From 6098b45b32e6baeacc04790773ced9340601d511 Mon Sep 17 00:00:00 2001
From: Gu Zheng
Date: Wed, 3 Sep 2014 17:45:44 +0800
Subject: aio: block exit_aio() until all context requests are completed

It seems that exit_aio() also needs to wait for all iocbs to complete
(like io_destroy), but we missed the wait step in the current
implementation, so fix it in the same way as we did in io_destroy.

Signed-off-by: Gu Zheng
Signed-off-by: Benjamin LaHaise
Cc: stable@vger.kernel.org
---
 fs/aio.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs/aio.c')

diff --git a/fs/aio.c b/fs/aio.c
index 5f2e9c6c328e..733750096b71 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -793,6 +793,8 @@ void exit_aio(struct mm_struct *mm)
 
 	for (i = 0; i < table->nr; ++i) {
 		struct kioctx *ctx = table->table[i];
+		struct completion requests_done =
+			COMPLETION_INITIALIZER_ONSTACK(requests_done);
 
 		if (!ctx)
 			continue;
@@ -804,7 +806,10 @@ void exit_aio(struct mm_struct *mm)
 		 * that it needs to unmap the area, just set it to 0.
 		 */
 		ctx->mmap_size = 0;
-		kill_ioctx(mm, ctx, NULL);
+		kill_ioctx(mm, ctx, &requests_done);
+
+		/* Wait until all IO for the context are done. */
+		wait_for_completion(&requests_done);
 	}
 
 	RCU_INIT_POINTER(mm->ioctx_table, NULL);
--
cgit v1.2.3-58-ga151
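Reduced to its essentials, the synchronization added above is the
standard on-stack completion idiom (sketch, not the exact kernel code;
start_async_teardown() is a hypothetical stand-in for kill_ioctx()):

	DECLARE_COMPLETION_ONSTACK(requests_done);	/* expands to the same initializer */

	start_async_teardown(ctx, &requests_done);	/* completes when the last request finishes */
	wait_for_completion(&requests_done);		/* the frame must not unwind before this returns */

The waiter owns the completion on its stack, the worker signals it
exactly once, and the waiting function may not return (and so pop the
stack frame) until wait_for_completion() comes back - which is exactly
why exit_aio() must block here.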