diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-09-25 15:46:04 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-09-25 15:46:04 -0700 |
commit | 19240e6b2a6c456d1c2975400a04b6b01f957cf0 (patch) | |
tree | 989908a19a86e2cbfe70d71d18da121b907b8738 /fs | |
parent | 17763641ff8a189bc6fb1fcd5c2a9844ee0dd89c (diff) | |
parent | fddc9923c6d41de9fe7b1f323a3cece53e046c88 (diff) |
Merge branch 'for-linus' of git://git.kernel.dk/linux-block
Pull block fixes from Jens Axboe:
- Two sets of NVMe pull requests from Christoph:
- Fixes for the Fibre Channel host/target to fix spec compliance
- Allow a zero keep alive timeout
- Make the debug printk for broken SGLs work better
- Fix queue zeroing during initialization
- Set of RDMA and FC fixes
- Target div-by-zero fix
- bsg double-free fix.
- ndb unknown ioctl fix from Josef.
- Buffered vs O_DIRECT page cache inconsistency fix. Has been floating
around for a long time, well reviewed. From Lukas.
- brd overflow fix from Mikulas.
- Fix for a loop regression in this merge window, where using a union
for two members of the loop_cmd turned out to be a really bad idea.
From Omar.
- Fix for an iostat regression fix in this series, using the wrong API
to get at the block queue. From Shaohua.
- Fix for a potential blktrace delection deadlock. From Waiman.
* 'for-linus' of git://git.kernel.dk/linux-block: (30 commits)
nvme-fcloop: fix port deletes and callbacks
nvmet-fc: sync header templates with comments
nvmet-fc: ensure target queue id within range.
nvmet-fc: on port remove call put outside lock
nvme-rdma: don't fully stop the controller in error recovery
nvme-rdma: give up reconnect if state change fails
nvme-core: Use nvme_wq to queue async events and fw activation
nvme: fix sqhd reference when admin queue connect fails
block: fix a crash caused by wrong API
fs: Fix page cache inconsistency when mixing buffered and AIO DIO
nvmet: implement valid sqhd values in completions
nvme-fabrics: Allow 0 as KATO value
nvme: allow timed-out ios to retry
nvme: stop aer posting if controller state not live
nvme-pci: Print invalid SGL only once
nvme-pci: initialize queue memory before interrupts
nvmet-fc: fix failing max io queue connections
nvme-fc: use transport-specific sgl format
nvme: add transport SGL definitions
nvme.h: remove FC transport-specific error values
...
Diffstat (limited to 'fs')
-rw-r--r-- | fs/direct-io.c | 49 | ||||
-rw-r--r-- | fs/iomap.c | 29 |
2 files changed, 59 insertions, 19 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c index 5fa2211e49ae..62cf812ed0e5 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -229,6 +229,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) { loff_t offset = dio->iocb->ki_pos; ssize_t transferred = 0; + int err; /* * AIO submission can race with bio completion to get here while @@ -258,8 +259,22 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) if (ret == 0) ret = transferred; + /* + * Try again to invalidate clean pages which might have been cached by + * non-direct readahead, or faulted in by get_user_pages() if the source + * of the write was an mmap'ed region of the file we're writing. Either + * one is a pretty crazy thing to do, so we don't support it 100%. If + * this invalidation fails, tough, the write still worked... + */ + if (ret > 0 && dio->op == REQ_OP_WRITE && + dio->inode->i_mapping->nrpages) { + err = invalidate_inode_pages2_range(dio->inode->i_mapping, + offset >> PAGE_SHIFT, + (offset + ret - 1) >> PAGE_SHIFT); + WARN_ON_ONCE(err); + } + if (dio->end_io) { - int err; // XXX: ki_pos?? err = dio->end_io(dio->iocb, offset, ret, dio->private); @@ -304,6 +319,7 @@ static void dio_bio_end_aio(struct bio *bio) struct dio *dio = bio->bi_private; unsigned long remaining; unsigned long flags; + bool defer_completion = false; /* cleanup the bio */ dio_bio_complete(dio, bio); @@ -315,7 +331,19 @@ static void dio_bio_end_aio(struct bio *bio) spin_unlock_irqrestore(&dio->bio_lock, flags); if (remaining == 0) { - if (dio->result && dio->defer_completion) { + /* + * Defer completion when defer_completion is set or + * when the inode has pages mapped and this is AIO write. + * We need to invalidate those pages because there is a + * chance they contain stale data in the case buffered IO + * went in between AIO submission and completion into the + * same region. + */ + if (dio->result) + defer_completion = dio->defer_completion || + (dio->op == REQ_OP_WRITE && + dio->inode->i_mapping->nrpages); + if (defer_completion) { INIT_WORK(&dio->complete_work, dio_aio_complete_work); queue_work(dio->inode->i_sb->s_dio_done_wq, &dio->complete_work); @@ -1210,10 +1238,19 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, * For AIO O_(D)SYNC writes we need to defer completions to a workqueue * so that we can call ->fsync. */ - if (dio->is_async && iov_iter_rw(iter) == WRITE && - ((iocb->ki_filp->f_flags & O_DSYNC) || - IS_SYNC(iocb->ki_filp->f_mapping->host))) { - retval = dio_set_defer_completion(dio); + if (dio->is_async && iov_iter_rw(iter) == WRITE) { + retval = 0; + if ((iocb->ki_filp->f_flags & O_DSYNC) || + IS_SYNC(iocb->ki_filp->f_mapping->host)) + retval = dio_set_defer_completion(dio); + else if (!dio->inode->i_sb->s_dio_done_wq) { + /* + * In case of AIO write racing with buffered read we + * need to defer completion. We can't decide this now, + * however the workqueue needs to be initialized here. + */ + retval = sb_init_dio_done_wq(dio->inode->i_sb); + } if (retval) { /* * We grab i_mutex only for reads so we don't have diff --git a/fs/iomap.c b/fs/iomap.c index 269b24a01f32..8194d30bdca0 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -713,8 +713,24 @@ struct iomap_dio { static ssize_t iomap_dio_complete(struct iomap_dio *dio) { struct kiocb *iocb = dio->iocb; + struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; + /* + * Try again to invalidate clean pages which might have been cached by + * non-direct readahead, or faulted in by get_user_pages() if the source + * of the write was an mmap'ed region of the file we're writing. Either + * one is a pretty crazy thing to do, so we don't support it 100%. If + * this invalidation fails, tough, the write still worked... + */ + if (!dio->error && + (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) { + ret = invalidate_inode_pages2_range(inode->i_mapping, + iocb->ki_pos >> PAGE_SHIFT, + (iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + } + if (dio->end_io) { ret = dio->end_io(iocb, dio->error ? dio->error : dio->size, @@ -1042,19 +1058,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, ret = iomap_dio_complete(dio); - /* - * Try again to invalidate clean pages which might have been cached by - * non-direct readahead, or faulted in by get_user_pages() if the source - * of the write was an mmap'ed region of the file we're writing. Either - * one is a pretty crazy thing to do, so we don't support it 100%. If - * this invalidation fails, tough, the write still worked... - */ - if (iov_iter_rw(iter) == WRITE) { - int err = invalidate_inode_pages2_range(mapping, - start >> PAGE_SHIFT, end >> PAGE_SHIFT); - WARN_ON_ONCE(err); - } - return ret; out_free_dio: |