From 793d7dbe6d82a50b9d14bf992b9eaacb70a11ce6 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 13 Oct 2017 09:47:45 -0700
Subject: xfs: cancel dirty pages on invalidation

Recently we've had warnings arise from the vm handing us pages
without bufferheads attached to them. This should not ever occur
in XFS, but we don't defend against it properly if it does. The only
place where we remove bufferheads from a page is in
xfs_vm_releasepage(), but we can't tell the difference here between
"page is dirty so don't release" and "page is dirty but is being
invalidated so release it".

In some places that are invalidating pages ask for pages to be
released and follow up afterward calling ->releasepage by checking
whether the page was dirty and then aborting the invalidation. This
is a possible vector for releasing buffers from a page but then
leaving it in the mapping, so we really do need to avoid dirty pages
in xfs_vm_releasepage().

To differentiate between invalidated pages and normal pages, we need
to clear the page dirty flag when invalidating the pages. This can
be done through xfs_vm_invalidatepage(), and will result
xfs_vm_releasepage() seeing the page as clean which matches the
bufferhead state on the page after calling block_invalidatepage().

Hence we can re-add the page dirty check in xfs_vm_releasepage to
catch the case where we might be releasing a page that is actually
dirty and so should not have the bufferheads on it removed. This
will remove one possible vector of "dirty page with no bufferheads"
and so help narrow down the search for the root cause of that
problem.

Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_aops.c | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index f18e5932aec4..067284d84d9e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -735,6 +735,14 @@ xfs_vm_invalidatepage(
 {
 	trace_xfs_invalidatepage(page->mapping->host, page, offset,
 				 length);
+
+	/*
+	 * If we are invalidating the entire page, clear the dirty state from it
+	 * so that we can check for attempts to release dirty cached pages in
+	 * xfs_vm_releasepage().
+	 */
+	if (offset == 0 && length >= PAGE_SIZE)
+		cancel_dirty_page(page);
 	block_invalidatepage(page, offset, length);
 }
 
@@ -1190,25 +1198,27 @@ xfs_vm_releasepage(
 	 * mm accommodates an old ext3 case where clean pages might not have had
 	 * the dirty bit cleared. Thus, it can send actual dirty pages to
 	 * ->releasepage() via shrink_active_list(). Conversely,
-	 * block_invalidatepage() can send pages that are still marked dirty
-	 * but otherwise have invalidated buffers.
+	 * block_invalidatepage() can send pages that are still marked dirty but
+	 * otherwise have invalidated buffers.
 	 *
 	 * We want to release the latter to avoid unnecessary buildup of the
-	 * LRU, skip the former and warn if we've left any lingering
-	 * delalloc/unwritten buffers on clean pages. Skip pages with delalloc
-	 * or unwritten buffers and warn if the page is not dirty. Otherwise
-	 * try to release the buffers.
+	 * LRU, so xfs_vm_invalidatepage() clears the page dirty flag on pages
+	 * that are entirely invalidated and need to be released.  Hence the
+	 * only time we should get dirty pages here is through
+	 * shrink_active_list() and so we can simply skip those now.
+	 *
+	 * warn if we've left any lingering delalloc/unwritten buffers on clean
+	 * or invalidated pages we are about to release.
 	 */
+	if (PageDirty(page))
+		return 0;
+
 	xfs_count_page_state(page, &delalloc, &unwritten);
 
-	if (delalloc) {
-		WARN_ON_ONCE(!PageDirty(page));
+	if (WARN_ON_ONCE(delalloc))
 		return 0;
-	}
-	if (unwritten) {
-		WARN_ON_ONCE(!PageDirty(page));
+	if (WARN_ON_ONCE(unwritten))
 		return 0;
-	}
 
 	return try_to_free_buffers(page);
 }
-- 
cgit v1.2.3-58-ga151


From 5e25c269e17de4c5a23ce886cda612b01365a944 Mon Sep 17 00:00:00 2001
From: Eryu Guan <eguan@redhat.com>
Date: Fri, 13 Oct 2017 09:47:46 -0700
Subject: fs: invalidate page cache after end_io() in dio completion

Commit 332391a9935d ("fs: Fix page cache inconsistency when mixing
buffered and AIO DIO") moved page cache invalidation from
iomap_dio_rw() to iomap_dio_complete() for iomap based direct write
path, but before the dio->end_io() call, and it re-introdued the bug
fixed by commit c771c14baa33 ("iomap: invalidate page caches should
be after iomap_dio_complete() in direct write").

I found this because fstests generic/418 started failing on XFS with
v4.14-rc3 kernel, which is the regression test for this specific
bug.

So similarly, fix it by moving dio->end_io() (which does the
unwritten extent conversion) before page cache invalidation, to make
sure next buffer read reads the final real allocations not unwritten
extents. I also add some comments about why should end_io() go first
in case we get it wrong again in the future.

Note that, there's no such problem in the non-iomap based direct
write path, because we didn't remove the page cache invalidation
after the ->direct_IO() in generic_file_direct_write() call, but I
decided to fix dio_complete() too so we don't leave a landmine
there, also be consistent with iomap_dio_complete().

Fixes: 332391a9935d ("fs: Fix page cache inconsistency when mixing buffered and AIO DIO")
Signed-off-by: Eryu Guan <eguan@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/direct-io.c | 20 ++++++++++++--------
 fs/iomap.c     | 41 ++++++++++++++++++++++++-----------------
 2 files changed, 36 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 96415c65bbdc..19ac3fe57deb 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -259,12 +259,24 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
 	if (ret == 0)
 		ret = transferred;
 
+	if (dio->end_io) {
+		// XXX: ki_pos??
+		err = dio->end_io(dio->iocb, offset, ret, dio->private);
+		if (err)
+			ret = err;
+	}
+
 	/*
 	 * Try again to invalidate clean pages which might have been cached by
 	 * non-direct readahead, or faulted in by get_user_pages() if the source
 	 * of the write was an mmap'ed region of the file we're writing.  Either
 	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
 	 * this invalidation fails, tough, the write still worked...
+	 *
+	 * And this page cache invalidation has to be after dio->end_io(), as
+	 * some filesystems convert unwritten extents to real allocations in
+	 * end_io() when necessary, otherwise a racing buffer read would cache
+	 * zeros from unwritten extents.
 	 */
 	if (ret > 0 && dio->op == REQ_OP_WRITE &&
 	    dio->inode->i_mapping->nrpages) {
@@ -274,14 +286,6 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
 		WARN_ON_ONCE(err);
 	}
 
-	if (dio->end_io) {
-
-		// XXX: ki_pos??
-		err = dio->end_io(dio->iocb, offset, ret, dio->private);
-		if (err)
-			ret = err;
-	}
-
 	if (!(dio->flags & DIO_SKIP_DIO_COUNT))
 		inode_dio_end(dio->inode);
 
diff --git a/fs/iomap.c b/fs/iomap.c
index be61cf742b5e..d4801f8dd4fd 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -714,23 +714,9 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
 {
 	struct kiocb *iocb = dio->iocb;
 	struct inode *inode = file_inode(iocb->ki_filp);
+	loff_t offset = iocb->ki_pos;
 	ssize_t ret;
 
-	/*
-	 * Try again to invalidate clean pages which might have been cached by
-	 * non-direct readahead, or faulted in by get_user_pages() if the source
-	 * of the write was an mmap'ed region of the file we're writing.  Either
-	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
-	 * this invalidation fails, tough, the write still worked...
-	 */
-	if (!dio->error &&
-	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
-		ret = invalidate_inode_pages2_range(inode->i_mapping,
-				iocb->ki_pos >> PAGE_SHIFT,
-				(iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT);
-		WARN_ON_ONCE(ret);
-	}
-
 	if (dio->end_io) {
 		ret = dio->end_io(iocb,
 				dio->error ? dio->error : dio->size,
@@ -742,12 +728,33 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
 	if (likely(!ret)) {
 		ret = dio->size;
 		/* check for short read */
-		if (iocb->ki_pos + ret > dio->i_size &&
+		if (offset + ret > dio->i_size &&
 		    !(dio->flags & IOMAP_DIO_WRITE))
-			ret = dio->i_size - iocb->ki_pos;
+			ret = dio->i_size - offset;
 		iocb->ki_pos += ret;
 	}
 
+	/*
+	 * Try again to invalidate clean pages which might have been cached by
+	 * non-direct readahead, or faulted in by get_user_pages() if the source
+	 * of the write was an mmap'ed region of the file we're writing.  Either
+	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
+	 * this invalidation fails, tough, the write still worked...
+	 *
+	 * And this page cache invalidation has to be after dio->end_io(), as
+	 * some filesystems convert unwritten extents to real allocations in
+	 * end_io() when necessary, otherwise a racing buffer read would cache
+	 * zeros from unwritten extents.
+	 */
+	if (!dio->error &&
+	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
+		int err;
+		err = invalidate_inode_pages2_range(inode->i_mapping,
+				offset >> PAGE_SHIFT,
+				(offset + dio->size - 1) >> PAGE_SHIFT);
+		WARN_ON_ONCE(err);
+	}
+
 	inode_dio_end(file_inode(iocb->ki_filp));
 	kfree(dio);
 
-- 
cgit v1.2.3-58-ga151


From 40214d128e07dd21bb07a8ed6a7fe2f911281ab2 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Fri, 13 Oct 2017 09:47:46 -0700
Subject: xfs: trim writepage mapping to within eof

The writeback rework in commit fbcc02561359 ("xfs: Introduce
writeback context for writepages") introduced a subtle change in
behavior with regard to the block mapping used across the
->writepages() sequence. The previous xfs_cluster_write() code would
only flush pages up to EOF at the time of the writepage, thus
ensuring that any pages due to file-extending writes would be
handled on a separate cycle and with a new, updated block mapping.

The updated code establishes a block mapping in xfs_writepage_map()
that could extend beyond EOF if the file has post-eof preallocation.
Because we now use the generic writeback infrastructure and pass the
cached mapping to each writepage call, there is no implicit EOF
limit in place. If eofblocks trimming occurs during ->writepages(),
any post-eof portion of the cached mapping becomes invalid. The
eofblocks code has no means to serialize against writeback because
there are no pages associated with post-eof blocks. Therefore if an
eofblocks trim occurs and is followed by a file-extending buffered
write, not only has the mapping become invalid, but we could end up
writing a page to disk based on the invalid mapping.

Consider the following sequence of events:

- A buffered write creates a delalloc extent and post-eof
  speculative preallocation.
- Writeback starts and on the first writepage cycle, the delalloc
  extent is converted to real blocks (including the post-eof blocks)
  and the mapping is cached.
- The file is closed and xfs_release() trims post-eof blocks. The
  cached writeback mapping is now invalid.
- Another buffered write appends the file with a delalloc extent.
- The concurrent writeback cycle picks up the just written page
  because the writeback range end is LLONG_MAX. xfs_writepage_map()
  attributes it to the (now invalid) cached mapping and writes the
  data to an incorrect location on disk (and where the file offset is
  still backed by a delalloc extent).

This problem is reproduced by xfstests test generic/464, which
triggers racing writes, appends, open/closes and writeback requests.

To address this problem, trim the mapping used during writeback to
within EOF when the mapping is validated. This ensures the mapping
is revalidated for any pages encountered beyond EOF as of the time
the current mapping was cached or last validated.

Reported-by: Eryu Guan <eguan@redhat.com>
Diagnosed-by: Eryu Guan <eguan@redhat.com>
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_bmap.c | 11 +++++++++++
 fs/xfs/libxfs/xfs_bmap.h |  1 +
 fs/xfs/xfs_aops.c        | 13 +++++++++++++
 3 files changed, 25 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index def32fa1c225..89263797cf32 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3852,6 +3852,17 @@ xfs_trim_extent(
 	}
 }
 
+/* trim extent to within eof */
+void
+xfs_trim_extent_eof(
+	struct xfs_bmbt_irec	*irec,
+	struct xfs_inode	*ip)
+
+{
+	xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount,
+					      i_size_read(VFS_I(ip))));
+}
+
 /*
  * Trim the returned map to the required bounds
  */
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 851982a5dfbc..502e0d8fb4ff 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -208,6 +208,7 @@ void	xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
 
 void	xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
 		xfs_filblks_t len);
+void	xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *);
 int	xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
 void	xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
 void	xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 067284d84d9e..a3eeaba156c5 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -446,6 +446,19 @@ xfs_imap_valid(
 {
 	offset >>= inode->i_blkbits;
 
+	/*
+	 * We have to make sure the cached mapping is within EOF to protect
+	 * against eofblocks trimming on file release leaving us with a stale
+	 * mapping. Otherwise, a page for a subsequent file extending buffered
+	 * write could get picked up by this writeback cycle and written to the
+	 * wrong blocks.
+	 *
+	 * Note that what we really want here is a generic mapping invalidation
+	 * mechanism to protect us from arbitrary extent modifying contexts, not
+	 * just eofblocks.
+	 */
+	xfs_trim_extent_eof(imap, XFS_I(inode));
+
 	return offset >= imap->br_startoff &&
 		offset < imap->br_startoff + imap->br_blockcount;
 }
-- 
cgit v1.2.3-58-ga151


From 785545c8982604fe3ba79d16409e83993be77d5e Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 13 Oct 2017 09:47:47 -0700
Subject: xfs: move two more RT specific functions into CONFIG_XFS_RT

The last cleanup introduced two harmless warnings:

fs/xfs/xfs_fsmap.c:480:1: warning: '__xfs_getfsmap_rtdev' defined but not used
fs/xfs/xfs_fsmap.c:372:1: warning: 'xfs_getfsmap_rtdev_rtbitmap_helper' defined but not used

This moves those two functions as well.

Fixes: bb9c2e543325 ("xfs: move more RT specific code under CONFIG_XFS_RT")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_fsmap.c | 48 ++++++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 560e0b40ac1b..43cfc07996a4 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -367,29 +367,6 @@ xfs_getfsmap_datadev_helper(
 	return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr);
 }
 
-/* Transform a rtbitmap "record" into a fsmap */
-STATIC int
-xfs_getfsmap_rtdev_rtbitmap_helper(
-	struct xfs_trans		*tp,
-	struct xfs_rtalloc_rec		*rec,
-	void				*priv)
-{
-	struct xfs_mount		*mp = tp->t_mountp;
-	struct xfs_getfsmap_info	*info = priv;
-	struct xfs_rmap_irec		irec;
-	xfs_daddr_t			rec_daddr;
-
-	rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock);
-
-	irec.rm_startblock = rec->ar_startblock;
-	irec.rm_blockcount = rec->ar_blockcount;
-	irec.rm_owner = XFS_RMAP_OWN_NULL;	/* "free" */
-	irec.rm_offset = 0;
-	irec.rm_flags = 0;
-
-	return xfs_getfsmap_helper(tp, info, &irec, rec_daddr);
-}
-
 /* Transform a bnobt irec into a fsmap */
 STATIC int
 xfs_getfsmap_datadev_bnobt_helper(
@@ -475,6 +452,30 @@ xfs_getfsmap_logdev(
 	return xfs_getfsmap_helper(tp, info, &rmap, 0);
 }
 
+#ifdef CONFIG_XFS_RT
+/* Transform a rtbitmap "record" into a fsmap */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap_helper(
+	struct xfs_trans		*tp,
+	struct xfs_rtalloc_rec		*rec,
+	void				*priv)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_getfsmap_info	*info = priv;
+	struct xfs_rmap_irec		irec;
+	xfs_daddr_t			rec_daddr;
+
+	rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock);
+
+	irec.rm_startblock = rec->ar_startblock;
+	irec.rm_blockcount = rec->ar_blockcount;
+	irec.rm_owner = XFS_RMAP_OWN_NULL;	/* "free" */
+	irec.rm_offset = 0;
+	irec.rm_flags = 0;
+
+	return xfs_getfsmap_helper(tp, info, &irec, rec_daddr);
+}
+
 /* Execute a getfsmap query against the realtime device. */
 STATIC int
 __xfs_getfsmap_rtdev(
@@ -521,7 +522,6 @@ __xfs_getfsmap_rtdev(
 	return query_fn(tp, info);
 }
 
-#ifdef CONFIG_XFS_RT
 /* Actually query the realtime bitmap. */
 STATIC int
 xfs_getfsmap_rtdev_rtbitmap_query(
-- 
cgit v1.2.3-58-ga151