From 9b038d004ce95551cb35381c49fe896c5bc11ffe Mon Sep 17 00:00:00 2001
From: David Howells
Date: Tue, 21 May 2024 14:37:43 +0100
Subject: netfs: Fix io_uring based write-through

This can be triggered by mounting a cifs filesystem with a cache=strict
mount option and then, using the fsx program from xfstests, doing:

	ltp/fsx -A -d -N 1000 -S 11463 -P /tmp /cifs-mount/foo \
		--replay-ops=gen112-fsxops

Where gen112-fsxops holds:

	fallocate 0x6be7 0x8fc5 0x377d3
	copy_range 0x9c71 0x77e8 0x2edaf 0x377d3
	write 0x2776d 0x8f65 0x377d3

The problem is that netfs_io_request::len is being used for two purposes
and ends up getting set to the amount of data we transferred, not the
amount of data the caller asked to be transferred (for various reasons,
such as mmap'd writes, we might end up rounding out the data written to
the server to include the entire folio at each end).

Fix this by keeping the amount we were asked to write in ->len and using
->submitted to track what we issued ops for.  Then, when we come to
calling ->ki_complete(), ->len is the right size.

This also required netfs_cleanup_dio_write() to change since we're no
longer advancing wreq->len.  Use wreq->transferred instead as we might
have done a short read.

With this, the generic/112 xfstest passes if cifs is forced to put all
non-DIO opens into write-through mode.

Fixes: 288ace2f57c9 ("netfs: New writeback implementation")
Signed-off-by: David Howells
Link: https://lore.kernel.org/r/295086.1716298663@warthog.procyon.org.uk
cc: Jeff Layton
cc: Steve French
cc: Enzo Matsumiya
cc: netfs@lists.linux.dev
cc: v9fs@lists.linux.dev
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Christian Brauner
---
 fs/netfs/direct_write.c  | 2 +-
 fs/netfs/write_collect.c | 7 ++++---
 fs/netfs/write_issue.c   | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index 608ba6416919..28163516bf03 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -12,7 +12,7 @@ static void netfs_cleanup_dio_write(struct netfs_io_request *wreq)
 {
 	struct inode *inode = wreq->inode;
-	unsigned long long end = wreq->start + wreq->len;
+	unsigned long long end = wreq->start + wreq->transferred;
 
 	if (!wreq->error &&
 	    i_size_read(inode) < end) {
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
index 60112e4b2c5e..426cf87aaf2e 100644
--- a/fs/netfs/write_collect.c
+++ b/fs/netfs/write_collect.c
@@ -510,7 +510,7 @@ reassess_streams:
 	 * stream has a gap that can be jumped.
 	 */
 	if (notes & SOME_EMPTY) {
-		unsigned long long jump_to = wreq->start + wreq->len;
+		unsigned long long jump_to = wreq->start + READ_ONCE(wreq->submitted);
 
 		for (s = 0; s < NR_IO_STREAMS; s++) {
 			stream = &wreq->io_streams[s];
@@ -690,10 +690,11 @@ void netfs_write_collection_worker(struct work_struct *work)
 	wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
 
 	if (wreq->iocb) {
-		wreq->iocb->ki_pos += wreq->transferred;
+		size_t written = min(wreq->transferred, wreq->len);
+		wreq->iocb->ki_pos += written;
 		if (wreq->iocb->ki_complete)
 			wreq->iocb->ki_complete(
-				wreq->iocb, wreq->error ? wreq->error : wreq->transferred);
+				wreq->iocb, wreq->error ? wreq->error : written);
 		wreq->iocb = VFS_PTR_POISON;
 	}
 
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
index e190043bc0da..86dad7e4202b 100644
--- a/fs/netfs/write_issue.c
+++ b/fs/netfs/write_issue.c
@@ -254,7 +254,7 @@ static void netfs_issue_write(struct netfs_io_request *wreq,
 	stream->construct = NULL;
 
 	if (subreq->start + subreq->len > wreq->start + wreq->submitted)
-		wreq->len = wreq->submitted = subreq->start + subreq->len - wreq->start;
+		WRITE_ONCE(wreq->submitted, subreq->start + subreq->len - wreq->start);
	netfs_do_issue_write(stream, subreq);
 }
--
cgit v1.2.3-58-ga151
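To illustrate the completion-size rule this patch establishes, here is a
minimal userspace sketch (not kernel code; the folio size is assumed to be
4KB and the offsets are borrowed from the fsx reproducer above): the value
reported back through ->ki_complete() must be capped at what the caller
asked for, even when the server write was rounded out to whole folios.

    /* Sketch: why ki_complete() must report min(transferred, len). */
    #include <stdio.h>

    #define FOLIO_SIZE 4096ull

    static unsigned long long min_ull(unsigned long long a, unsigned long long b)
    {
            return a < b ? a : b;
    }

    int main(void)
    {
            unsigned long long start = 0x2776d;  /* from the fsx reproducer */
            unsigned long long len = 0x8f65;     /* what the caller asked to write */

            /* The server write is rounded out to folio boundaries at both ends. */
            unsigned long long lower = start & ~(FOLIO_SIZE - 1);
            unsigned long long upper = (start + len + FOLIO_SIZE - 1) & ~(FOLIO_SIZE - 1);
            unsigned long long transferred = upper - lower;

            /* Reporting 'transferred' would tell io_uring we wrote more than asked. */
            unsigned long long written = min_ull(transferred, len);

            printf("asked=%#llx transferred=%#llx reported=%#llx\n",
                   len, transferred, written);
            return 0;
    }

Keeping the requested size in ->len and the issued range in ->submitted is
what makes this cap computable at completion time.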
From 2c6b531020f0590db3b6b4950a41c692e9aa4f4a Mon Sep 17 00:00:00 2001
From: David Howells
Date: Tue, 21 May 2024 14:36:27 +0100
Subject: netfs: Fix AIO error handling when doing write-through

If an error occurs whilst we're doing an AIO write in write-through mode,
we may end up calling ->ki_complete() *and* returning an error from
->write_iter().  This can result in either a UAF (the ->ki_complete()
func pointer may get overwritten, for example) or a refcount underflow in
io_submit() as ->ki_complete is called twice.

Fix this by making netfs_end_writethrough() - and thus
netfs_perform_write() - unconditionally return -EIOCBQUEUED if we're
doing an AIO write and wait for completion if we're not.

Fixes: 288ace2f57c9 ("netfs: New writeback implementation")
Signed-off-by: David Howells
Link: https://lore.kernel.org/r/295052.1716298587@warthog.procyon.org.uk
cc: Jeff Layton
cc: Enzo Matsumiya
cc: netfs@lists.linux.dev
cc: v9fs@lists.linux.dev
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Christian Brauner
---
 fs/netfs/write_issue.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
index 86dad7e4202b..3aa86e268f40 100644
--- a/fs/netfs/write_issue.c
+++ b/fs/netfs/write_issue.c
@@ -636,7 +636,12 @@ int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_contr
 
 	mutex_unlock(&ictx->wb_lock);
 
-	ret = wreq->error;
+	if (wreq->iocb) {
+		ret = -EIOCBQUEUED;
+	} else {
+		wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE);
+		ret = wreq->error;
+	}
 	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
 	return ret;
 }
--
cgit v1.2.3-58-ga151
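The invariant the patch restores is that exactly one path may report the
result of an async request.  A minimal sketch in plain C (the struct and
function names are illustrative stand-ins, not the kernel's):

    #include <stdbool.h>
    #include <stdio.h>

    #define EIOCBQUEUED 529  /* kernel-internal "queued, result comes later" */

    struct fake_request {
            bool has_iocb;   /* stands in for wreq->iocb != NULL (AIO) */
            int error;       /* stands in for wreq->error */
    };

    static int end_writethrough(struct fake_request *req)
    {
            if (req->has_iocb)
                    return -EIOCBQUEUED;  /* result delivered via ki_complete() only */
            /* synchronous caller: (pretend to) wait for IN_PROGRESS to clear */
            return req->error;
    }

    int main(void)
    {
            struct fake_request aio = { .has_iocb = true, .error = -5 };
            struct fake_request sync = { .has_iocb = false, .error = -5 };

            printf("aio:  %d (error reported later, exactly once)\n",
                   end_writethrough(&aio));
            printf("sync: %d (error reported here)\n",
                   end_writethrough(&sync));
            return 0;
    }

Returning the error *and* letting ->ki_complete() fire is what produced
the double completion that io_uring's refcounting cannot tolerate.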
From 4e527d5841e24623181edc7fd6f6598ffa810e10 Mon Sep 17 00:00:00 2001
From: Xu Yang
Date: Tue, 21 May 2024 19:49:39 +0800
Subject: iomap: fault in smaller chunks for non-large folio mappings

Since commit 5d8edfb900d5 ("iomap: Copy larger chunks from userspace"),
iomap will try to copy in larger chunks than PAGE_SIZE.  However, if the
mapping doesn't support large folios, only a single page of at most 4KB
will be created and 4KB of data will be written to the pagecache each
time.  The next 4KB will then be handled in the next iteration.  This
causes a potential write performance problem.  If the chunk is 2MB, a
total of 512 pages need to be handled in the end.  During this period,
fault_in_iov_iter_readable() is called to check that the iov_iter is
readable.  Since only 4KB will be handled each time, the address ranges
below will be checked over and over again:

	start		end
	buf,		buf+2MB
	buf+4KB,	buf+2MB
	buf+8KB,	buf+2MB
	...
	buf+2044KB,	buf+2MB

Obviously the checked size is wrong, since only 4KB will be handled each
time.  So pick a chunk size that matches what the mapping can actually
cache, letting iomap work well in the non-large folio case.

With this change, the write speed will be stable.  Tested on an ARM64
device.

Before:
 - dd if=/dev/zero of=/dev/sda bs=400K count=10485 (334 MB/s)
 - dd if=/dev/zero of=/dev/sda bs=800K count=5242 (278 MB/s)
 - dd if=/dev/zero of=/dev/sda bs=1600K count=2621 (204 MB/s)
 - dd if=/dev/zero of=/dev/sda bs=2200K count=1906 (170 MB/s)
 - dd if=/dev/zero of=/dev/sda bs=3000K count=1398 (150 MB/s)
 - dd if=/dev/zero of=/dev/sda bs=4500K count=932 (139 MB/s)

After:
 - dd if=/dev/zero of=/dev/sda bs=400K count=10485 (339 MB/s)
 - dd if=/dev/zero of=/dev/sda bs=800K count=5242 (330 MB/s)
 - dd if=/dev/zero of=/dev/sda bs=1600K count=2621 (332 MB/s)
 - dd if=/dev/zero of=/dev/sda bs=2200K count=1906 (333 MB/s)
 - dd if=/dev/zero of=/dev/sda bs=3000K count=1398 (333 MB/s)
 - dd if=/dev/zero of=/dev/sda bs=4500K count=932 (333 MB/s)

Fixes: 5d8edfb900d5 ("iomap: Copy larger chunks from userspace")
Cc: stable@vger.kernel.org
Reviewed-by: Darrick J. Wong
Signed-off-by: Xu Yang
Link: https://lore.kernel.org/r/20240521114939.2541461-2-xu.yang_2@nxp.com
Reviewed-by: Christoph Hellwig
Reviewed-by: Matthew Wilcox (Oracle)
Signed-off-by: Christian Brauner
---
 fs/iomap/buffered-io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 41c8f0c68ef5..c5802a459334 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -898,11 +898,11 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
 static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 {
 	loff_t length = iomap_length(iter);
-	size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
 	loff_t pos = iter->pos;
 	ssize_t total_written = 0;
 	long status = 0;
 	struct address_space *mapping = iter->inode->i_mapping;
+	size_t chunk = mapping_max_folio_size(mapping);
 	unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
 
 	do {
--
cgit v1.2.3-58-ga151
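The cost described above can be modelled in a few lines of userspace C.
This is a sketch, assuming a 2MB buffer and a mapping that only takes 4KB
per pass (the non-large-folio case); fault_in_cost() stands in for the
cumulative work done by fault_in_iov_iter_readable():

    #include <stdio.h>

    #define SZ_4K (4ull << 10)
    #define SZ_2M (2ull << 20)

    /* Total bytes fault-checked to copy a 2MB buffer at a given chunk size. */
    static unsigned long long fault_in_cost(unsigned long long chunk)
    {
            unsigned long long copied = 0, checked = 0;

            while (copied < SZ_2M) {
                    unsigned long long bytes = SZ_2M - copied;

                    if (bytes > chunk)
                            bytes = chunk;  /* size handed to the fault-in check */
                    checked += bytes;
                    copied += SZ_4K;        /* only one small folio lands per pass */
            }
            return checked;
    }

    int main(void)
    {
            /* old: chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER (2MB here) */
            printf("chunk=2MB: %llu MB checked\n", fault_in_cost(SZ_2M) >> 20);
            /* new: chunk = mapping_max_folio_size() (4KB, no large folios) */
            printf("chunk=4KB: %llu MB checked\n", fault_in_cost(SZ_4K) >> 20);
            return 0;
    }

With the old chunk this checks about 513MB of address space to copy 2MB;
with the corrected chunk it checks exactly 2MB.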
From f826bc9d6fc2f0e089fb8d104415d72e4d2e204c Mon Sep 17 00:00:00 2001
From: Fedor Pchelkin
Date: Mon, 20 May 2024 12:08:18 +0300
Subject: signalfd: fix error return code

If anon_inode_getfile() fails, return the appropriate error code instead
of the previously reserved fd number.  This looks like a single typo:
the similar code changes in timerfd and userfaultfd are okay.

Found by Linux Verification Center (linuxtesting.org).

Fixes: fbe38120eb1d ("signalfd: convert to ->read_iter()")
Signed-off-by: Fedor Pchelkin
Link: https://lore.kernel.org/r/20240520090819.76342-1-pchelkin@ispras.ru
Reviewed-by: Jens Axboe
Signed-off-by: Christian Brauner
---
 fs/signalfd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/signalfd.c b/fs/signalfd.c
index 4a5614442dbf..65fe5eed0be4 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -282,7 +282,7 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
 		if (IS_ERR(file)) {
 			put_unused_fd(ufd);
 			kfree(ctx);
-			return ufd;
+			return PTR_ERR(file);
 		}
 		file->f_mode |= FMODE_NOWAIT;
--
cgit v1.2.3-58-ga151

From 65bea9953715b19371164a8bec4f74fdd22c9e5a Mon Sep 17 00:00:00 2001
From: Fedor Pchelkin
Date: Mon, 20 May 2024 12:08:19 +0300
Subject: signalfd: drop an obsolete comment

Commit fbe38120eb1d ("signalfd: convert to ->read_iter()") removed the
call to anon_inode_getfd() by splitting fd setup into two parts.  Drop
the comment referencing the internal details of that function.

Signed-off-by: Fedor Pchelkin
Link: https://lore.kernel.org/r/20240520090819.76342-2-pchelkin@ispras.ru
Reviewed-by: Jens Axboe
Signed-off-by: Christian Brauner
---
 fs/signalfd.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/signalfd.c b/fs/signalfd.c
index 65fe5eed0be4..ec7b2da2477a 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -286,10 +286,6 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
 		}
 		file->f_mode |= FMODE_NOWAIT;
 
-		/*
-		 * When we call this, the initialization must be complete, since
-		 * anon_inode_getfd() will install the fd.
-		 */
 		fd_install(ufd, file);
 	} else {
 		struct fd f = fdget(ufd);
--
cgit v1.2.3-58-ga151

From c596bea1452ddf172ec9b588e4597228e9a1f4d5 Mon Sep 17 00:00:00 2001
From: David Howells
Date: Tue, 21 May 2024 16:49:46 +0100
Subject: netfs: Fix setting of BDP_ASYNC from iocb flags

Fix netfs_perform_write() to set BDP_ASYNC if IOCB_NOWAIT is set rather
than if IOCB_SYNC is not set.  The flag reflects asynchronicity in the
sense of not waiting rather than synchronicity in the sense of not
returning until the op is complete.

Without this, generic/590 fails on cifs in strict caching mode with a
complaint that one of the writes fails with EAGAIN.  The test can be
distilled down to:

	mount -t cifs /my/share /mnt -ostuff
	xfs_io -i -c 'falloc 0 8191M' -c fsync -f /mnt/file
	xfs_io -i -c 'pwrite -b 1M -W 0 8191M' /mnt/file

Fixes: c38f4e96e605 ("netfs: Provide func to copy data to pagecache for buffered write")
Signed-off-by: David Howells
Link: https://lore.kernel.org/r/316306.1716306586@warthog.procyon.org.uk
Reviewed-by: Jens Axboe
cc: Jeff Layton
cc: Enzo Matsumiya
cc: Jens Axboe
cc: Matthew Wilcox
cc: netfs@lists.linux.dev
cc: v9fs@lists.linux.dev
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Christian Brauner
---
 fs/netfs/buffered_write.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 1121601536d1..07bc1fd43530 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -181,7 +181,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 	struct folio *folio, *writethrough = NULL;
 	enum netfs_how_to_modify howto;
 	enum netfs_folio_trace trace;
-	unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC;
+	unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0;
 	ssize_t written = 0, ret, ret2;
 	loff_t i_size, pos = iocb->ki_pos, from, to;
 	size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
--
cgit v1.2.3-58-ga151
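The inverted flag mapping is easy to see in isolation.  A sketch of the
translation the patch corrects, using illustrative flag values rather than
the kernel's real bit assignments:

    #include <stdio.h>

    /* Illustrative values only; the real kernel bit positions differ. */
    #define IOCB_SYNC   (1 << 0)
    #define IOCB_NOWAIT (1 << 1)
    #define BDP_ASYNC   (1 << 0)

    static unsigned int bdp_flags_old(int ki_flags)
    {
            return (ki_flags & IOCB_SYNC) ? 0 : BDP_ASYNC;    /* buggy mapping */
    }

    static unsigned int bdp_flags_new(int ki_flags)
    {
            return (ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0;  /* fixed mapping */
    }

    int main(void)
    {
            /* A plain blocking buffered write: neither SYNC nor NOWAIT set. */
            int ki_flags = 0;

            /* Old code wrongly marked a blocking write as "cannot sleep"... */
            printf("old: %u (spurious BDP_ASYNC -> can surface EAGAIN)\n",
                   bdp_flags_old(ki_flags));
            /* ...new code sets BDP_ASYNC only when the caller asked not to wait. */
            printf("new: %u\n", bdp_flags_new(ki_flags));
            return 0;
    }

BDP_ASYNC means "this caller cannot sleep in dirty-page balancing", which
corresponds to IOCB_NOWAIT, not to the absence of IOCB_SYNC.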
From 29be9100aca2915fab54b5693309bc42956542e5 Mon Sep 17 00:00:00 2001
From: Marc Dionne
Date: Fri, 24 May 2024 17:17:55 +0100
Subject: afs: Don't cross .backup mountpoint from backup volume

Don't cross a mountpoint that explicitly specifies a backup volume
(target is .backup) when starting from a backup volume.

It is not uncommon to mount a volume's backup directly in the volume
itself.  This can cause tools that are not paying attention to get into
a loop mounting the volume onto itself as they attempt to traverse the
tree, leading to a variety of problems.

This doesn't prevent the general case of loops in a sequence of
mountpoints, but addresses a common special case in the same way as
other afs clients.

Reported-by: Jan Henrik Sylvester
Link: http://lists.infradead.org/pipermail/linux-afs/2024-May/008454.html
Reported-by: Markus Suvanto
Link: http://lists.infradead.org/pipermail/linux-afs/2024-February/008074.html
Signed-off-by: Marc Dionne
Signed-off-by: David Howells
Link: https://lore.kernel.org/r/768760.1716567475@warthog.procyon.org.uk
Reviewed-by: Jeffrey Altman
cc: linux-afs@lists.infradead.org
Signed-off-by: Christian Brauner
---
 fs/afs/mntpt.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 97f50e9fd9eb..297487ee8323 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -140,6 +140,11 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
 		put_page(page);
 		if (ret < 0)
 			return ret;
+
+		/* Don't cross a backup volume mountpoint from a backup volume */
+		if (src_as->volume && src_as->volume->type == AFSVL_BACKVOL &&
+		    ctx->type == AFSVL_BACKVOL)
+			return -ENODEV;
 	}
 
 	return 0;
--
cgit v1.2.3-58-ga151
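The guard itself is a two-condition check.  A toy model (the enum and
may_cross() are illustrative stand-ins; only the backup-to-backup refusal
and the -ENODEV convention mirror the patch):

    #include <stdio.h>

    enum volume_type { VOL_RW, VOL_RO, VOL_BACKUP };  /* AFSVL_BACKVOL analogue */

    static int may_cross(enum volume_type src, enum volume_type target)
    {
            if (src == VOL_BACKUP && target == VOL_BACKUP)
                    return -19;  /* -ENODEV: refuse the mount, breaking the loop */
            return 0;
    }

    int main(void)
    {
            printf("rw     -> backup: %d (allowed)\n",
                   may_cross(VOL_RW, VOL_BACKUP));
            printf("backup -> backup: %d (refused)\n",
                   may_cross(VOL_BACKUP, VOL_BACKUP));
            return 0;
    }

Refusing at mountpoint-traversal time is what stops a backup volume
mounted inside itself from being followed indefinitely.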
From f89ea63f1c65d3e93b255f14f9d9e05df87955fa Mon Sep 17 00:00:00 2001
From: David Howells
Date: Fri, 24 May 2024 15:26:11 +0100
Subject: netfs, 9p: Fix race between umount and async request completion

There's a problem in 9p's interaction with netfslib whereby a crash
occurs because the 9p_fid structs get forcibly destroyed during client
teardown (without paying attention to their refcounts) before netfslib
has finished with them.  However, it's not a simple case of deferring
the clunking that p9_fid_put() does as that requires the p9_client
record to still be present.

The problem is that netfslib has to unlock pages and clear the
IN_PROGRESS flag before destroying the objects involved - including the
fid - and, in any case, nothing checks to see if writeback completed
barring looking at the page flags.

Fix this by keeping a count of outstanding I/O requests (of any type)
and waiting for it to quiesce during inode eviction.

Reported-by: syzbot+df038d463cca332e8414@syzkaller.appspotmail.com
Link: https://lore.kernel.org/all/0000000000005be0aa061846f8d6@google.com/
Reported-by: syzbot+d7c7a495a5e466c031b6@syzkaller.appspotmail.com
Link: https://lore.kernel.org/all/000000000000b86c5e06130da9c6@google.com/
Reported-by: syzbot+1527696d41a634cc1819@syzkaller.appspotmail.com
Link: https://lore.kernel.org/all/000000000000041f960618206d7e@google.com/
Signed-off-by: David Howells
Link: https://lore.kernel.org/r/755891.1716560771@warthog.procyon.org.uk
Tested-by: syzbot+d7c7a495a5e466c031b6@syzkaller.appspotmail.com
Reviewed-by: Dominique Martinet
cc: Eric Van Hensbergen
cc: Latchesar Ionkov
cc: Christian Schoenebeck
cc: Jeff Layton
cc: Steve French
cc: Hillf Danton
cc: v9fs@lists.linux.dev
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
Reported-and-tested-by: syzbot+d7c7a495a5e466c031b6@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner
---
 fs/9p/vfs_inode.c      |  1 +
 fs/afs/inode.c         |  1 +
 fs/netfs/objects.c     |  5 +++++
 fs/smb/client/cifsfs.c |  1 +
 include/linux/netfs.h  | 18 ++++++++++++++++++
 5 files changed, 26 insertions(+)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 7a3308d77606..fd72fc38c8f5 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -348,6 +348,7 @@ void v9fs_evict_inode(struct inode *inode)
 	__le32 __maybe_unused version;
 
 	if (!is_bad_inode(inode)) {
+		netfs_wait_for_outstanding_io(inode);
 		truncate_inode_pages_final(&inode->i_data);
 
 		version = cpu_to_le32(v9inode->qid.version);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 94fc049aff58..15bb7989c387 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -648,6 +648,7 @@ void afs_evict_inode(struct inode *inode)
 
 	ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
 
+	netfs_wait_for_outstanding_io(inode);
 	truncate_inode_pages_final(&inode->i_data);
 
 	afs_set_cache_aux(vnode, &aux);
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index c90d482b1650..f4a642727479 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -72,6 +72,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
 		}
 	}
 
+	atomic_inc(&ctx->io_count);
 	trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new);
 	netfs_proc_add_rreq(rreq);
 	netfs_stat(&netfs_n_rh_rreq);
@@ -124,6 +125,7 @@ static void netfs_free_request(struct work_struct *work)
 {
 	struct netfs_io_request *rreq =
 		container_of(work, struct netfs_io_request, work);
+	struct netfs_inode *ictx = netfs_inode(rreq->inode);
 	unsigned int i;
 
 	trace_netfs_rreq(rreq, netfs_rreq_trace_free);
@@ -142,6 +144,9 @@ static void netfs_free_request(struct work_struct *work)
 		}
 		kvfree(rreq->direct_bv);
 	}
+
+	if (atomic_dec_and_test(&ictx->io_count))
+		wake_up_var(&ictx->io_count);
 	call_rcu(&rreq->rcu, netfs_free_request_rcu);
 }
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index ec5b639f421a..14810ffd15c8 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -431,6 +431,7 @@ cifs_free_inode(struct inode *inode)
 static void
 cifs_evict_inode(struct inode *inode)
 {
+	netfs_wait_for_outstanding_io(inode);
 	truncate_inode_pages_final(&inode->i_data);
 	if (inode->i_state & I_PINNING_NETFS_WB)
 		cifs_fscache_unuse_inode_cookie(inode, true);
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index ca56a4428043..3b22ce0d064c 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -68,6 +68,7 @@ struct netfs_inode {
 	loff_t			remote_i_size;	/* Size of the remote file */
 	loff_t			zero_point;	/* Size after which we assume there's no data
 						 * on the server */
+	atomic_t		io_count;	/* Number of outstanding reqs */
 	unsigned long		flags;
 #define NETFS_ICTX_ODIRECT	0		/* The file has DIO in progress */
 #define NETFS_ICTX_UNBUFFERED	1		/* I/O should not use the pagecache */
@@ -472,6 +473,7 @@ static inline void netfs_inode_init(struct netfs_inode *ctx,
 	ctx->remote_i_size = i_size_read(&ctx->inode);
 	ctx->zero_point = LLONG_MAX;
 	ctx->flags = 0;
+	atomic_set(&ctx->io_count, 0);
 #if IS_ENABLED(CONFIG_FSCACHE)
 	ctx->cache = NULL;
 #endif
@@ -515,4 +517,20 @@ static inline struct fscache_cookie *netfs_i_cookie(struct netfs_inode *ctx)
 #endif
 }
 
+/**
+ * netfs_wait_for_outstanding_io - Wait for outstanding I/O to complete
+ * @inode: The netfs inode to wait on
+ *
+ * Wait for outstanding I/O requests of any type to complete.  This is
+ * intended to be called from inode eviction routines.  This makes sure that
+ * any resources held by those requests are cleaned up before we let the
+ * inode get cleaned up.
+ */
+static inline void netfs_wait_for_outstanding_io(struct inode *inode)
+{
+	struct netfs_inode *ictx = netfs_inode(inode);
+
+	wait_var_event(&ictx->io_count, atomic_read(&ictx->io_count) == 0);
+}
+
 #endif /* _LINUX_NETFS_H */
--
cgit v1.2.3-58-ga151
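The quiescing scheme above follows a classic counted-reference pattern:
every request bumps a per-inode counter at allocation, drops it at free,
and eviction blocks until the counter hits zero.  A userspace sketch using
pthreads in place of wait_var_event()/wake_up_var() (all names here are
illustrative, not the kernel API):

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t quiesced = PTHREAD_COND_INITIALIZER;
    static int io_count;  /* analogue of netfs_inode::io_count */

    static void request_get(void)
    {
            pthread_mutex_lock(&lock);
            io_count++;                     /* atomic_inc(&ctx->io_count) */
            pthread_mutex_unlock(&lock);
    }

    static void request_put(void)
    {
            pthread_mutex_lock(&lock);
            if (--io_count == 0)            /* atomic_dec_and_test() */
                    pthread_cond_broadcast(&quiesced);  /* wake_up_var() */
            pthread_mutex_unlock(&lock);
    }

    static void *async_io(void *arg)
    {
            usleep(100000);                 /* simulated in-flight write */
            request_put();
            return NULL;
    }

    int main(void)
    {
            pthread_t t;

            request_get();                  /* request allocated */
            pthread_create(&t, NULL, async_io, NULL);

            /* netfs_wait_for_outstanding_io(): block eviction until idle */
            pthread_mutex_lock(&lock);
            while (io_count != 0)
                    pthread_cond_wait(&quiesced, &lock);
            pthread_mutex_unlock(&lock);

            printf("quiesced; safe to evict inode\n");
            pthread_join(t, NULL);
            return 0;
    }

Counting requests rather than pages is what lets the wait cover I/O of any
type, which is exactly the gap the syzbot reproducers hit.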