From fc1f91b9231a28fba333f931a031bf776bc6ef0e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 21 Jul 2023 16:09:43 -0400 Subject: btrfs: wait for actual caching progress during allocation Recently we've been having mysterious hangs while running generic/475 on the CI system. This turned out to be something like this: Task 1 dmsetup suspend --nolockfs -> __dm_suspend -> dm_wait_for_completion -> dm_wait_for_bios_completion -> Unable to complete because of IO's on a plug in Task 2 Task 2 wb_workfn -> wb_writeback -> blk_start_plug -> writeback_sb_inodes -> Infinite loop unable to make an allocation Task 3 cache_block_group ->read_extent_buffer_pages ->Waiting for IO to complete that can't be submitted because Task 1 suspended the DM device The problem here is that we need Task 2 to be scheduled completely for the blk plug to flush. Normally this would happen, we normally wait for the block group caching to finish (Task 3), and this schedule would result in the block plug flushing. However if there's enough free space available from the current caching to satisfy the allocation we won't actually wait for the caching to complete. This check however just checks that we have enough space, not that we can make the allocation. In this particular case we were trying to allocate 9MiB, and we had 10MiB of free space, but we didn't have 9MiB of contiguous space to allocate, and thus the allocation failed and we looped. We specifically don't cycle through the FFE loop until we stop finding cached block groups because we don't want to allocate new block groups just because we're caching, so we short circuit the normal loop once we hit LOOP_CACHING_WAIT and we found a caching block group. This is normally fine, except in this particular case where the caching thread can't make progress because the DM device has been suspended. Fix this by not only waiting for free space to >= the amount of space we want to allocate, but also that we make some progress in caching from the time we start waiting. This will keep us from busy looping when the caching is taking a while but still theoretically has enough space for us to allocate from, and fixes this particular case by forcing us to actually sleep and wait for forward progress, which will flush the plug. With this fix we're no longer hanging with generic/475. CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Boris Burkov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 17 +++++++++++++++-- fs/btrfs/block-group.h | 2 ++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 63c3b7172ba5..1e4b70f5280d 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -441,13 +441,23 @@ void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, u64 num_bytes) { struct btrfs_caching_control *caching_ctl; + int progress; caching_ctl = btrfs_get_caching_control(cache); if (!caching_ctl) return; + /* + * We've already failed to allocate from this block group, so even if + * there's enough space in the block group it isn't contiguous enough to + * allow for an allocation, so wait for at least the next wakeup tick, + * or for the thing to be done. + */ + progress = atomic_read(&caching_ctl->progress); + wait_event(caching_ctl->wait, btrfs_block_group_done(cache) || - (cache->free_space_ctl->free_space >= num_bytes)); + (progress != atomic_read(&caching_ctl->progress) && + (cache->free_space_ctl->free_space >= num_bytes))); btrfs_put_caching_control(caching_ctl); } @@ -802,8 +812,10 @@ next: if (total_found > CACHING_CTL_WAKE_UP) { total_found = 0; - if (wakeup) + if (wakeup) { + atomic_inc(&caching_ctl->progress); wake_up(&caching_ctl->wait); + } } } path->slots[0]++; @@ -910,6 +922,7 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait) init_waitqueue_head(&caching_ctl->wait); caching_ctl->block_group = cache; refcount_set(&caching_ctl->count, 2); + atomic_set(&caching_ctl->progress, 0); btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); spin_lock(&cache->lock); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index aba5dff66c19..74b61e663028 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -90,6 +90,8 @@ struct btrfs_caching_control { wait_queue_head_t wait; struct btrfs_work work; struct btrfs_block_group *block_group; + /* Track progress of caching during allocation. */ + atomic_t progress; refcount_t count; }; -- cgit v1.2.3-58-ga151 From effa24f689ce0948f68c754991a445a8d697d3a8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 06:26:53 -0700 Subject: btrfs: don't stop integrity writeback too early extent_write_cache_pages stops writing pages as soon as nr_to_write hits zero. That is the right thing for opportunistic writeback, but incorrect for data integrity writeback, which needs to ensure that no dirty pages are left in the range. Thus only stop the writeback for WB_SYNC_NONE if nr_to_write hits 0. This is a port of write_cache_pages changes in commit 05fe478dd04e ("mm: write_cache_pages integrity fix"). Note that I've only trigger the problem with other changes to the btrfs writeback code, but this condition seems worthwhile fixing anyway. CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba [ updated comment ] Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a91d5ad27984..c36eb4956f81 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2164,11 +2164,12 @@ retry: } /* - * the filesystem may choose to bump up nr_to_write. + * The filesystem may choose to bump up nr_to_write. * We have to make sure to honor the new nr_to_write - * at any time + * at any time. */ - nr_to_write_done = wbc->nr_to_write <= 0; + nr_to_write_done = (wbc->sync_mode == WB_SYNC_NONE && + wbc->nr_to_write <= 0); } folio_batch_release(&fbatch); cond_resched(); -- cgit v1.2.3-58-ga151 From 5c25699871112853f231e52d51c576d5c759a020 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 06:26:54 -0700 Subject: btrfs: don't wait for writeback on clean pages in extent_write_cache_pages __extent_writepage could have started on more pages than the one it was called for. This happens regularly for zoned file systems, and in theory could happen for compressed I/O if the worker thread was executed very quickly. For such pages extent_write_cache_pages waits for writeback to complete before moving on to the next page, which is highly inefficient as it blocks the flusher thread. Port over the PageDirty check that was added to write_cache_pages in commit 515f4a037fb ("mm: write_cache_pages optimise page cleaning") to fix this. CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c36eb4956f81..ca765d62324f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2145,6 +2145,12 @@ retry: continue; } + if (!folio_test_dirty(folio)) { + /* Someone wrote it for us. */ + folio_unlock(folio); + continue; + } + if (wbc->sync_mode != WB_SYNC_NONE) { if (folio_test_writeback(folio)) submit_write_bio(bio_ctrl, 0); -- cgit v1.2.3-58-ga151 From 12b2d64e591652a2d97dd3afa2b062ca7a4ba352 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jul 2023 06:26:55 -0700 Subject: btrfs: properly clear end of the unreserved range in cow_file_range When the call to btrfs_reloc_clone_csums in cow_file_range returns an error, we jump to the out_unlock label with the extent_reserved variable set to false. The cleanup at the label will then call extent_clear_unlock_delalloc on the range from start to end. But we've already added cur_alloc_size to start before the jump, so there might no range be left from the newly incremented start to end. Move the check for 'start < end' so that it is reached by also for the !extent_reserved case. CC: stable@vger.kernel.org # 6.1+ Fixes: a315e68f6e8b ("Btrfs: fix invalid attempt to free reserved space on failure to cow range") Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 49cef61f6a39..9055e19b01ef 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1654,8 +1654,6 @@ out_unlock: clear_bits, page_ops); start += cur_alloc_size; - if (start >= end) - return ret; } /* @@ -1664,9 +1662,11 @@ out_unlock: * space_info's bytes_may_use counter, reserved in * btrfs_check_data_free_space(). */ - extent_clear_unlock_delalloc(inode, start, end, locked_page, - clear_bits | EXTENT_CLEAR_DATA_RESV, - page_ops); + if (start < end) { + clear_bits |= EXTENT_CLEAR_DATA_RESV; + extent_clear_unlock_delalloc(inode, start, end, locked_page, + clear_bits, page_ops); + } return ret; } -- cgit v1.2.3-58-ga151 From 773e722a98e25caf96f91aced7070c1858250ba2 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 3 Aug 2023 17:20:41 +0800 Subject: btrfs: avoid race between qgroup tree creation and relocation [BUG] Syzbot reported a weird ASSERT() triggered inside prepare_to_merge(). assertion failed: root->reloc_root == reloc_root, in fs/btrfs/relocation.c:1919 ------------[ cut here ]------------ kernel BUG at fs/btrfs/relocation.c:1919! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 9904 Comm: syz-executor.3 Not tainted 6.4.0-syzkaller-08881-g533925cb7604 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/27/2023 RIP: 0010:prepare_to_merge+0xbb2/0xc40 fs/btrfs/relocation.c:1919 Code: fe e9 f5 (...) RSP: 0018:ffffc9000325f760 EFLAGS: 00010246 RAX: 000000000000004f RBX: ffff888075644030 RCX: 1481ccc522da5800 RDX: ffffc90005c09000 RSI: 00000000000364ca RDI: 00000000000364cb RBP: ffffc9000325f870 R08: ffffffff816f33ac R09: 1ffff9200064bea0 R10: dffffc0000000000 R11: fffff5200064bea1 R12: ffff888075644000 R13: ffff88803b166000 R14: ffff88803b166560 R15: ffff88803b166558 FS: 00007f4e305fd700(0000) GS:ffff8880b9800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000056080679c000 CR3: 00000000193ad000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: relocate_block_group+0xa5d/0xcd0 fs/btrfs/relocation.c:3749 btrfs_relocate_block_group+0x7ab/0xd70 fs/btrfs/relocation.c:4087 btrfs_relocate_chunk+0x12c/0x3b0 fs/btrfs/volumes.c:3283 __btrfs_balance+0x1b06/0x2690 fs/btrfs/volumes.c:4018 btrfs_balance+0xbdb/0x1120 fs/btrfs/volumes.c:4402 btrfs_ioctl_balance+0x496/0x7c0 fs/btrfs/ioctl.c:3604 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:870 [inline] __se_sys_ioctl+0xf8/0x170 fs/ioctl.c:856 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f4e2f88c389 [CAUSE] With extra debugging, the offending reloc_root is for quota tree (rootid 8). Normally we should not use the reloc tree for quota root at all, as reloc trees are only for subvolume trees. But there is a race between quota enabling and relocation, this happens after commit 85724171b302 ("btrfs: fix the btrfs_get_global_root return value"). Before that commit, for quota and free space tree, we exit immediately if we cannot grab it from fs_info. But now we would try to read it from disk, just as if they are fs trees, this sets ROOT_SHAREABLE flags in such race: Thread A | Thread B ---------------------------------+------------------------------ btrfs_quota_enable() | | | btrfs_get_root_ref() | | |- btrfs_get_global_root() | | | Returned NULL | | |- btrfs_lookup_fs_root() | | | Returned NULL |- btrfs_create_tree() | | | Now quota root item is | | | inserted | |- btrfs_read_tree_root() | | | Got the newly inserted quota root | | |- btrfs_init_fs_root() | | | Set ROOT_SHAREABLE flag [FIX] Get back to the old behavior by returning PTR_ERR(-ENOENT) if the target objectid is not a subvolume tree or data reloc tree. Reported-and-tested-by: syzbot+ae97a827ae1c3336bbb4@syzkaller.appspotmail.com Fixes: 85724171b302 ("btrfs: fix the btrfs_get_global_root return value") Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9b9914e5f03d..11b1ac716f3b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1300,6 +1300,16 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, root = btrfs_get_global_root(fs_info, objectid); if (root) return root; + + /* + * If we're called for non-subvolume trees, and above function didn't + * find one, do not try to read it from disk. + * + * This is namely for free-space-tree and quota tree, which can change + * at runtime and should only be grabbed from fs_info. + */ + if (!is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) + return ERR_PTR(-ENOENT); again: root = btrfs_lookup_fs_root(fs_info, objectid); if (root) { -- cgit v1.2.3-58-ga151 From 05d7ce504545f7874529701664c90814ca645c5d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 3 Aug 2023 17:20:42 +0800 Subject: btrfs: exit gracefully if reloc roots don't match [BUG] Syzbot reported a crash that an ASSERT() got triggered inside prepare_to_merge(). [CAUSE] The root cause of the triggered ASSERT() is we can have a race between quota tree creation and relocation. This leads us to create a duplicated quota tree in the btrfs_read_fs_root() path, and since it's treated as fs tree, it would have ROOT_SHAREABLE flag, causing us to create a reloc tree for it. The bug itself is fixed by a dedicated patch for it, but this already taught us the ASSERT() is not something straightforward for developers. [ENHANCEMENT] Instead of using an ASSERT(), let's handle it gracefully and output extra info about the mismatch reloc roots to help debug. Also with the above ASSERT() removed, we can trigger ASSERT(0)s inside merge_reloc_roots() later. Also replace those ASSERT(0)s with WARN_ON()s. CC: stable@vger.kernel.org # 5.15+ Reported-by: syzbot+ae97a827ae1c3336bbb4@syzkaller.appspotmail.com Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 45 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 25a3361caedc..46c3c1d57266 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1916,7 +1916,39 @@ again: err = PTR_ERR(root); break; } - ASSERT(root->reloc_root == reloc_root); + + if (unlikely(root->reloc_root != reloc_root)) { + if (root->reloc_root) { + btrfs_err(fs_info, +"reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu", + root->root_key.objectid, + root->reloc_root->root_key.objectid, + root->reloc_root->root_key.type, + root->reloc_root->root_key.offset, + btrfs_root_generation( + &root->reloc_root->root_item), + reloc_root->root_key.objectid, + reloc_root->root_key.type, + reloc_root->root_key.offset, + btrfs_root_generation( + &reloc_root->root_item)); + } else { + btrfs_err(fs_info, +"reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu", + root->root_key.objectid, + reloc_root->root_key.objectid, + reloc_root->root_key.type, + reloc_root->root_key.offset, + btrfs_root_generation( + &reloc_root->root_item)); + } + list_add(&reloc_root->root_list, &reloc_roots); + btrfs_put_root(root); + btrfs_abort_transaction(trans, -EUCLEAN); + if (!err) + err = -EUCLEAN; + break; + } /* * set reference count to 1, so btrfs_recover_relocation @@ -1989,7 +2021,7 @@ again: root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); if (btrfs_root_refs(&reloc_root->root_item) > 0) { - if (IS_ERR(root)) { + if (WARN_ON(IS_ERR(root))) { /* * For recovery we read the fs roots on mount, * and if we didn't find the root then we marked @@ -1998,17 +2030,14 @@ again: * memory. However there's no reason we can't * handle the error properly here just in case. */ - ASSERT(0); ret = PTR_ERR(root); goto out; } - if (root->reloc_root != reloc_root) { + if (WARN_ON(root->reloc_root != reloc_root)) { /* - * This is actually impossible without something - * going really wrong (like weird race condition - * or cosmic rays). + * This can happen if on-disk metadata has some + * corruption, e.g. bad reloc tree key offset. */ - ASSERT(0); ret = -EINVAL; goto out; } -- cgit v1.2.3-58-ga151 From 6ebcd021c92b8e4b904552e4d87283032100796d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 3 Aug 2023 17:20:43 +0800 Subject: btrfs: reject invalid reloc tree root keys with stack dump [BUG] Syzbot reported a crash that an ASSERT() got triggered inside prepare_to_merge(). That ASSERT() makes sure the reloc tree is properly pointed back by its subvolume tree. [CAUSE] After more debugging output, it turns out we had an invalid reloc tree: BTRFS error (device loop1): reloc tree mismatch, root 8 has no reloc root, expect reloc root key (-8, 132, 8) gen 17 Note the above root key is (TREE_RELOC_OBJECTID, ROOT_ITEM, QUOTA_TREE_OBJECTID), meaning it's a reloc tree for quota tree. But reloc trees can only exist for subvolumes, as for non-subvolume trees, we just COW the involved tree block, no need to create a reloc tree since those tree blocks won't be shared with other trees. Only subvolumes tree can share tree blocks with other trees (thus they have BTRFS_ROOT_SHAREABLE flag). Thus this new debug output proves my previous assumption that corrupted on-disk data can trigger that ASSERT(). [FIX] Besides the dedicated fix and the graceful exit, also let tree-checker to check such root keys, to make sure reloc trees can only exist for subvolumes. CC: stable@vger.kernel.org # 5.15+ Reported-by: syzbot+ae97a827ae1c3336bbb4@syzkaller.appspotmail.com Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 ++- fs/btrfs/tree-checker.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 11b1ac716f3b..a9a2c5446c18 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1103,7 +1103,8 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) btrfs_drew_lock_init(&root->snapshot_lock); if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && - !btrfs_is_data_reloc_root(root)) { + !btrfs_is_data_reloc_root(root) && + is_fstree(root->root_key.objectid)) { set_bit(BTRFS_ROOT_SHAREABLE, &root->state); btrfs_check_and_init_root_item(&root->root_item); } diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 351ba9e90675..11d81e39ef4e 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -446,6 +446,20 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key, btrfs_item_key_to_cpu(leaf, &item_key, slot); is_root_item = (item_key.type == BTRFS_ROOT_ITEM_KEY); + /* + * Bad rootid for reloc trees. + * + * Reloc trees are only for subvolume trees, other trees only need + * to be COWed to be relocated. + */ + if (unlikely(is_root_item && key->objectid == BTRFS_TREE_RELOC_OBJECTID && + !is_fstree(key->offset))) { + generic_err(leaf, slot, + "invalid reloc tree for root %lld, root id is not a subvolume tree", + key->offset); + return -EUCLEAN; + } + /* No such tree id */ if (unlikely(key->objectid == 0)) { if (is_root_item) -- cgit v1.2.3-58-ga151 From 92fb94b69c6accf1e49fff699640fa0ce03dc910 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 2 Aug 2023 09:20:24 -0400 Subject: btrfs: set cache_block_group_error if we find an error We set cache_block_group_error if btrfs_cache_block_group() returns an error, this is because we could end up not finding space to allocate and mistakenly return -ENOSPC, and which could then abort the transaction with the incorrect errno, and in the case of ENOSPC result in a WARN_ON() that will trip up tests like generic/475. However there's the case where multiple threads can be racing, one thread gets the proper error, and the other thread doesn't actually call btrfs_cache_block_group(), it instead sees ->cached == BTRFS_CACHE_ERROR. Again the result is the same, we fail to allocate our space and return -ENOSPC. Instead we need to set cache_block_group_error to -EIO in this case to make sure that if we do not make our allocation we get the appropriate error returned back to the caller. CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 911908ea5f6f..f396a9afa403 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4310,8 +4310,11 @@ have_block_group: ret = 0; } - if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) + if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) { + if (!cache_block_group_error) + cache_block_group_error = -EIO; goto loop; + } if (!find_free_extent_check_size_class(ffe_ctl, block_group)) goto loop; -- cgit v1.2.3-58-ga151