From bdcdd86ca94b5e9faa18d6f4d3dda660ac5c887e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 00:54:40 +0000 Subject: btrfs: fix assertion failure and blocking during nowait buffered write When doing a nowait buffered write we can trigger the following assertion: [11138.437027] assertion failed: !path->nowait, in fs/btrfs/ctree.c:4658 [11138.438251] ------------[ cut here ]------------ [11138.438254] kernel BUG at fs/btrfs/messages.c:259! [11138.438762] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI [11138.439450] CPU: 4 PID: 1091021 Comm: fsstress Not tainted 6.1.0-rc4-btrfs-next-128 #1 [11138.440611] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [11138.442553] RIP: 0010:btrfs_assertfail+0x19/0x1b [btrfs] [11138.443583] Code: 5b 41 5a 41 (...) [11138.446437] RSP: 0018:ffffbaf0cf05b840 EFLAGS: 00010246 [11138.447235] RAX: 0000000000000039 RBX: ffffbaf0cf05b938 RCX: 0000000000000000 [11138.448303] RDX: 0000000000000000 RSI: ffffffffb2ef59f6 RDI: 00000000ffffffff [11138.449370] RBP: ffff9165f581eb68 R08: 00000000ffffffff R09: 0000000000000001 [11138.450493] R10: ffff9167a88421f8 R11: 0000000000000000 R12: ffff9164981b1000 [11138.451661] R13: 000000008c8f1000 R14: ffff9164991d4000 R15: ffff9164981b1000 [11138.452225] FS: 00007f1438a66440(0000) GS:ffff9167ad600000(0000) knlGS:0000000000000000 [11138.452949] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [11138.453394] CR2: 00007f1438a64000 CR3: 0000000100c36002 CR4: 0000000000370ee0 [11138.454057] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [11138.454879] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [11138.455779] Call Trace: [11138.456211] [11138.456598] btrfs_next_old_leaf.cold+0x18/0x1d [btrfs] [11138.457827] ? kmem_cache_alloc+0x18d/0x2a0 [11138.458516] btrfs_lookup_csums_range+0x149/0x4d0 [btrfs] [11138.459407] csum_exist_in_range+0x56/0x110 [btrfs] [11138.460271] can_nocow_file_extent+0x27c/0x310 [btrfs] [11138.461155] can_nocow_extent+0x1ec/0x2e0 [btrfs] [11138.461672] btrfs_check_nocow_lock+0x114/0x1c0 [btrfs] [11138.462951] btrfs_buffered_write+0x44c/0x8e0 [btrfs] [11138.463482] btrfs_do_write_iter+0x42b/0x5f0 [btrfs] [11138.463982] ? lock_release+0x153/0x4a0 [11138.464347] io_write+0x11b/0x570 [11138.464660] ? lock_release+0x153/0x4a0 [11138.465213] ? lock_is_held_type+0xe8/0x140 [11138.466003] io_issue_sqe+0x63/0x4a0 [11138.466339] io_submit_sqes+0x238/0x770 [11138.466741] __do_sys_io_uring_enter+0x37b/0xb10 [11138.467206] ? lock_is_held_type+0xe8/0x140 [11138.467879] ? syscall_enter_from_user_mode+0x1d/0x50 [11138.468688] do_syscall_64+0x38/0x90 [11138.469265] entry_SYSCALL_64_after_hwframe+0x63/0xcd [11138.470017] RIP: 0033:0x7f1438c539e6 This is because to check if we can NOCOW, we check that if we can NOCOW into an extent (it's prealloc extent or the inode has NOCOW attribute), and then check if there are csums for the extent's range in the csum tree. The search may leave us beyond the last slot of a leaf, and then when we call btrfs_next_leaf() we end up at btrfs_next_old_leaf() with a time_seq of 0. This triggers a failure of the first assertion at btrfs_next_old_leaf(), since we have a nowait path. With assertions disabled, we simply don't respect the NOWAIT semantics, allowing the write to block on locks or blocking on IO for reading an extent buffer from disk. Fix this by: 1) Triggering the assertion only if time_seq is not 0, which means that search is being done by a tree mod log user, and in the buffered and direct IO write paths we don't use the tree mod log; 2) Implementing NOWAIT semantics at btrfs_next_old_leaf(). Any failure to lock an extent buffer should return immediately and not retry the search, as well as if we need to do IO to read an extent buffer from disk. Fixes: c922b016f353 ("btrfs: assert nowait mode is not used for some btree search functions") Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a9543f01184c..dcb510f38dda 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4663,7 +4663,12 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, int ret; int i; - ASSERT(!path->nowait); + /* + * The nowait semantics are used only for write paths, where we don't + * use the tree mod log and sequence numbers. + */ + if (time_seq) + ASSERT(!path->nowait); nritems = btrfs_header_nritems(path->nodes[0]); if (nritems == 0) @@ -4683,7 +4688,14 @@ again: if (path->need_commit_sem) { path->need_commit_sem = 0; need_commit_sem = true; - down_read(&fs_info->commit_root_sem); + if (path->nowait) { + if (!down_read_trylock(&fs_info->commit_root_sem)) { + ret = -EAGAIN; + goto done; + } + } else { + down_read(&fs_info->commit_root_sem); + } } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); } @@ -4759,7 +4771,7 @@ again: next = c; ret = read_block_for_search(root, path, &next, level, slot, &key); - if (ret == -EAGAIN) + if (ret == -EAGAIN && !path->nowait) goto again; if (ret < 0) { @@ -4769,6 +4781,10 @@ again: if (!path->skip_locking) { ret = btrfs_try_tree_read_lock(next); + if (!ret && path->nowait) { + ret = -EAGAIN; + goto done; + } if (!ret && time_seq) { /* * If we don't get the lock, we may be racing @@ -4799,7 +4815,7 @@ again: ret = read_block_for_search(root, path, &next, level, 0, &key); - if (ret == -EAGAIN) + if (ret == -EAGAIN && !path->nowait) goto again; if (ret < 0) { @@ -4807,8 +4823,16 @@ again: goto done; } - if (!path->skip_locking) - btrfs_tree_read_lock(next); + if (!path->skip_locking) { + if (path->nowait) { + if (!btrfs_try_tree_read_lock(next)) { + ret = -EAGAIN; + goto done; + } + } else { + btrfs_tree_read_lock(next); + } + } } ret = 0; done: -- cgit v1.2.3-58-ga151 From b740d806166979488e798e41743aaec051f2443f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 7 Nov 2022 11:44:51 -0500 Subject: btrfs: free btrfs_path before copying root refs to userspace Syzbot reported the following lockdep splat ====================================================== WARNING: possible circular locking dependency detected 6.0.0-rc7-syzkaller-18095-gbbed346d5a96 #0 Not tainted ------------------------------------------------------ syz-executor307/3029 is trying to acquire lock: ffff0000c02525d8 (&mm->mmap_lock){++++}-{3:3}, at: __might_fault+0x54/0xb4 mm/memory.c:5576 but task is already holding lock: ffff0000c958a608 (btrfs-root-00){++++}-{3:3}, at: __btrfs_tree_read_lock fs/btrfs/locking.c:134 [inline] ffff0000c958a608 (btrfs-root-00){++++}-{3:3}, at: btrfs_tree_read_lock fs/btrfs/locking.c:140 [inline] ffff0000c958a608 (btrfs-root-00){++++}-{3:3}, at: btrfs_read_lock_root_node+0x13c/0x1c0 fs/btrfs/locking.c:279 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (btrfs-root-00){++++}-{3:3}: down_read_nested+0x64/0x84 kernel/locking/rwsem.c:1624 __btrfs_tree_read_lock fs/btrfs/locking.c:134 [inline] btrfs_tree_read_lock fs/btrfs/locking.c:140 [inline] btrfs_read_lock_root_node+0x13c/0x1c0 fs/btrfs/locking.c:279 btrfs_search_slot_get_root+0x74/0x338 fs/btrfs/ctree.c:1637 btrfs_search_slot+0x1b0/0xfd8 fs/btrfs/ctree.c:1944 btrfs_update_root+0x6c/0x5a0 fs/btrfs/root-tree.c:132 commit_fs_roots+0x1f0/0x33c fs/btrfs/transaction.c:1459 btrfs_commit_transaction+0x89c/0x12d8 fs/btrfs/transaction.c:2343 flush_space+0x66c/0x738 fs/btrfs/space-info.c:786 btrfs_async_reclaim_metadata_space+0x43c/0x4e0 fs/btrfs/space-info.c:1059 process_one_work+0x2d8/0x504 kernel/workqueue.c:2289 worker_thread+0x340/0x610 kernel/workqueue.c:2436 kthread+0x12c/0x158 kernel/kthread.c:376 ret_from_fork+0x10/0x20 arch/arm64/kernel/entry.S:860 -> #2 (&fs_info->reloc_mutex){+.+.}-{3:3}: __mutex_lock_common+0xd4/0xca8 kernel/locking/mutex.c:603 __mutex_lock kernel/locking/mutex.c:747 [inline] mutex_lock_nested+0x38/0x44 kernel/locking/mutex.c:799 btrfs_record_root_in_trans fs/btrfs/transaction.c:516 [inline] start_transaction+0x248/0x944 fs/btrfs/transaction.c:752 btrfs_start_transaction+0x34/0x44 fs/btrfs/transaction.c:781 btrfs_create_common+0xf0/0x1b4 fs/btrfs/inode.c:6651 btrfs_create+0x8c/0xb0 fs/btrfs/inode.c:6697 lookup_open fs/namei.c:3413 [inline] open_last_lookups fs/namei.c:3481 [inline] path_openat+0x804/0x11c4 fs/namei.c:3688 do_filp_open+0xdc/0x1b8 fs/namei.c:3718 do_sys_openat2+0xb8/0x22c fs/open.c:1313 do_sys_open fs/open.c:1329 [inline] __do_sys_openat fs/open.c:1345 [inline] __se_sys_openat fs/open.c:1340 [inline] __arm64_sys_openat+0xb0/0xe0 fs/open.c:1340 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall arch/arm64/kernel/syscall.c:52 [inline] el0_svc_common+0x138/0x220 arch/arm64/kernel/syscall.c:142 do_el0_svc+0x48/0x164 arch/arm64/kernel/syscall.c:206 el0_svc+0x58/0x150 arch/arm64/kernel/entry-common.c:636 el0t_64_sync_handler+0x84/0xf0 arch/arm64/kernel/entry-common.c:654 el0t_64_sync+0x18c/0x190 arch/arm64/kernel/entry.S:581 -> #1 (sb_internal#2){.+.+}-{0:0}: percpu_down_read include/linux/percpu-rwsem.h:51 [inline] __sb_start_write include/linux/fs.h:1826 [inline] sb_start_intwrite include/linux/fs.h:1948 [inline] start_transaction+0x360/0x944 fs/btrfs/transaction.c:683 btrfs_join_transaction+0x30/0x40 fs/btrfs/transaction.c:795 btrfs_dirty_inode+0x50/0x140 fs/btrfs/inode.c:6103 btrfs_update_time+0x1c0/0x1e8 fs/btrfs/inode.c:6145 inode_update_time fs/inode.c:1872 [inline] touch_atime+0x1f0/0x4a8 fs/inode.c:1945 file_accessed include/linux/fs.h:2516 [inline] btrfs_file_mmap+0x50/0x88 fs/btrfs/file.c:2407 call_mmap include/linux/fs.h:2192 [inline] mmap_region+0x7fc/0xc14 mm/mmap.c:1752 do_mmap+0x644/0x97c mm/mmap.c:1540 vm_mmap_pgoff+0xe8/0x1d0 mm/util.c:552 ksys_mmap_pgoff+0x1cc/0x278 mm/mmap.c:1586 __do_sys_mmap arch/arm64/kernel/sys.c:28 [inline] __se_sys_mmap arch/arm64/kernel/sys.c:21 [inline] __arm64_sys_mmap+0x58/0x6c arch/arm64/kernel/sys.c:21 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall arch/arm64/kernel/syscall.c:52 [inline] el0_svc_common+0x138/0x220 arch/arm64/kernel/syscall.c:142 do_el0_svc+0x48/0x164 arch/arm64/kernel/syscall.c:206 el0_svc+0x58/0x150 arch/arm64/kernel/entry-common.c:636 el0t_64_sync_handler+0x84/0xf0 arch/arm64/kernel/entry-common.c:654 el0t_64_sync+0x18c/0x190 arch/arm64/kernel/entry.S:581 -> #0 (&mm->mmap_lock){++++}-{3:3}: check_prev_add kernel/locking/lockdep.c:3095 [inline] check_prevs_add kernel/locking/lockdep.c:3214 [inline] validate_chain kernel/locking/lockdep.c:3829 [inline] __lock_acquire+0x1530/0x30a4 kernel/locking/lockdep.c:5053 lock_acquire+0x100/0x1f8 kernel/locking/lockdep.c:5666 __might_fault+0x7c/0xb4 mm/memory.c:5577 _copy_to_user include/linux/uaccess.h:134 [inline] copy_to_user include/linux/uaccess.h:160 [inline] btrfs_ioctl_get_subvol_rootref+0x3a8/0x4bc fs/btrfs/ioctl.c:3203 btrfs_ioctl+0xa08/0xa64 fs/btrfs/ioctl.c:5556 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:870 [inline] __se_sys_ioctl fs/ioctl.c:856 [inline] __arm64_sys_ioctl+0xd0/0x140 fs/ioctl.c:856 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall arch/arm64/kernel/syscall.c:52 [inline] el0_svc_common+0x138/0x220 arch/arm64/kernel/syscall.c:142 do_el0_svc+0x48/0x164 arch/arm64/kernel/syscall.c:206 el0_svc+0x58/0x150 arch/arm64/kernel/entry-common.c:636 el0t_64_sync_handler+0x84/0xf0 arch/arm64/kernel/entry-common.c:654 el0t_64_sync+0x18c/0x190 arch/arm64/kernel/entry.S:581 other info that might help us debug this: Chain exists of: &mm->mmap_lock --> &fs_info->reloc_mutex --> btrfs-root-00 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(btrfs-root-00); lock(&fs_info->reloc_mutex); lock(btrfs-root-00); lock(&mm->mmap_lock); *** DEADLOCK *** 1 lock held by syz-executor307/3029: #0: ffff0000c958a608 (btrfs-root-00){++++}-{3:3}, at: __btrfs_tree_read_lock fs/btrfs/locking.c:134 [inline] #0: ffff0000c958a608 (btrfs-root-00){++++}-{3:3}, at: btrfs_tree_read_lock fs/btrfs/locking.c:140 [inline] #0: ffff0000c958a608 (btrfs-root-00){++++}-{3:3}, at: btrfs_read_lock_root_node+0x13c/0x1c0 fs/btrfs/locking.c:279 stack backtrace: CPU: 0 PID: 3029 Comm: syz-executor307 Not tainted 6.0.0-rc7-syzkaller-18095-gbbed346d5a96 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/30/2022 Call trace: dump_backtrace+0x1c4/0x1f0 arch/arm64/kernel/stacktrace.c:156 show_stack+0x2c/0x54 arch/arm64/kernel/stacktrace.c:163 __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x104/0x16c lib/dump_stack.c:106 dump_stack+0x1c/0x58 lib/dump_stack.c:113 print_circular_bug+0x2c4/0x2c8 kernel/locking/lockdep.c:2053 check_noncircular+0x14c/0x154 kernel/locking/lockdep.c:2175 check_prev_add kernel/locking/lockdep.c:3095 [inline] check_prevs_add kernel/locking/lockdep.c:3214 [inline] validate_chain kernel/locking/lockdep.c:3829 [inline] __lock_acquire+0x1530/0x30a4 kernel/locking/lockdep.c:5053 lock_acquire+0x100/0x1f8 kernel/locking/lockdep.c:5666 __might_fault+0x7c/0xb4 mm/memory.c:5577 _copy_to_user include/linux/uaccess.h:134 [inline] copy_to_user include/linux/uaccess.h:160 [inline] btrfs_ioctl_get_subvol_rootref+0x3a8/0x4bc fs/btrfs/ioctl.c:3203 btrfs_ioctl+0xa08/0xa64 fs/btrfs/ioctl.c:5556 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:870 [inline] __se_sys_ioctl fs/ioctl.c:856 [inline] __arm64_sys_ioctl+0xd0/0x140 fs/ioctl.c:856 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall arch/arm64/kernel/syscall.c:52 [inline] el0_svc_common+0x138/0x220 arch/arm64/kernel/syscall.c:142 do_el0_svc+0x48/0x164 arch/arm64/kernel/syscall.c:206 el0_svc+0x58/0x150 arch/arm64/kernel/entry-common.c:636 el0t_64_sync_handler+0x84/0xf0 arch/arm64/kernel/entry-common.c:654 el0t_64_sync+0x18c/0x190 arch/arm64/kernel/entry.S:581 We do generally the right thing here, copying the references into a temporary buffer, however we are still holding the path when we do copy_to_user from the temporary buffer. Fix this by freeing the path before we copy to user space. Reported-by: syzbot+4ef9e52e464c6ff47d9d@syzkaller.appspotmail.com CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d5dd8bed1488..89b8d14cb68c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3194,6 +3194,8 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, } out: + btrfs_free_path(path); + if (!ret || ret == -EOVERFLOW) { rootrefs->num_items = found; /* update min_treeid for next search */ @@ -3205,7 +3207,6 @@ out: } kfree(rootrefs); - btrfs_free_path(path); return ret; } -- cgit v1.2.3-58-ga151 From 418ffb9e3cf6c4e2574d3a732b724916684bd133 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 10 Nov 2022 11:36:28 +0530 Subject: btrfs: free btrfs_path before copying inodes to userspace btrfs_ioctl_logical_to_ino() frees the search path after the userspace copy from the temp buffer @inodes. Which potentially can lead to a lock splat. Fix this by freeing the path before we copy @inodes to userspace. CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 89b8d14cb68c..b595f2c6dfc9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4282,21 +4282,20 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, size = min_t(u32, loi->size, SZ_16M); } - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - inodes = init_data_container(size); if (IS_ERR(inodes)) { ret = PTR_ERR(inodes); - inodes = NULL; - goto out; + goto out_loi; } + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } ret = iterate_inodes_from_logical(loi->logical, fs_info, path, inodes, ignore_offset); + btrfs_free_path(path); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) @@ -4308,7 +4307,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, ret = -EFAULT; out: - btrfs_free_path(path); kvfree(inodes); out_loi: kfree(loi); -- cgit v1.2.3-58-ga151 From 8cf96b409d9b3946ece58ced13f92d0f775b0442 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 10 Nov 2022 11:36:29 +0530 Subject: btrfs: free btrfs_path before copying fspath to userspace btrfs_ioctl_ino_to_path() frees the search path after the userspace copy from the temp buffer @ipath->fspath. Which potentially can lead to a lock splat warning. Fix this by freeing the path before we copy it to userspace. CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b595f2c6dfc9..df5b893494fa 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4232,6 +4232,8 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) ipath->fspath->val[i] = rel_ptr; } + btrfs_free_path(path); + path = NULL; ret = copy_to_user((void __user *)(unsigned long)ipa->fspath, ipath->fspath, size); if (ret) { -- cgit v1.2.3-58-ga151 From 013c1c5585ebcfb19c88efe79063d0463b1b6159 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 10 Nov 2022 11:36:31 +0530 Subject: btrfs: free btrfs_path before copying subvol info to userspace btrfs_ioctl_get_subvol_info() frees the search path after the userspace copy from the temp buffer @subvol_info. This can lead to a lock splat warning. Fix this by freeing the path before we copy it to userspace. CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index df5b893494fa..5ba2e810dc6e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3105,6 +3105,8 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp) } } + btrfs_free_path(path); + path = NULL; if (copy_to_user(argp, subvol_info, sizeof(*subvol_info))) ret = -EFAULT; -- cgit v1.2.3-58-ga151 From c51f0e6a1254b3ac2d308e1c6fd8fb936992b455 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Nov 2022 10:39:44 +0100 Subject: btrfs: zoned: fix missing endianness conversion in sb_write_pointer generation is an on-disk __le64 value, so use btrfs_super_generation to convert it to host endian before comparing it. Fixes: 12659251ca5d ("btrfs: implement log-structured superblock for ZONED mode") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 1912abf6d020..44d8177eeffc 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -134,7 +134,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, super[i] = page_address(page[i]); } - if (super[0]->generation > super[1]->generation) + if (btrfs_super_generation(super[0]) > + btrfs_super_generation(super[1])) sector = zones[1].start; else sector = zones[0].start; -- cgit v1.2.3-58-ga151 From a11452a3709e217492798cf3686ac2cc8eb3fb51 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 15 Nov 2022 16:29:44 +0000 Subject: btrfs: send: avoid unaligned encoded writes when attempting to clone range When trying to see if we can clone a file range, there are cases where we end up sending two write operations in case the inode from the source root has an i_size that is not sector size aligned and the length from the current offset to its i_size is less than the remaining length we are trying to clone. Issuing two write operations when we could instead issue a single write operation is not incorrect. However it is not optimal, specially if the extents are compressed and the flag BTRFS_SEND_FLAG_COMPRESSED was passed to the send ioctl. In that case we can end up sending an encoded write with an offset that is not sector size aligned, which makes the receiver fallback to decompressing the data and writing it using regular buffered IO (so re-compressing the data in case the fs is mounted with compression enabled), because encoded writes fail with -EINVAL when an offset is not sector size aligned. The following example, which triggered a bug in the receiver code for the fallback logic of decompressing + regular buffer IO and is fixed by the patchset referred in a Link at the bottom of this changelog, is an example where we have the non-optimal behaviour due to an unaligned encoded write: $ cat test.sh #!/bin/bash DEV=/dev/sdj MNT=/mnt/sdj mkfs.btrfs -f $DEV > /dev/null mount -o compress $DEV $MNT # File foo has a size of 33K, not aligned to the sector size. xfs_io -f -c "pwrite -S 0xab 0 33K" $MNT/foo xfs_io -f -c "pwrite -S 0xcd 0 64K" $MNT/bar # Now clone the first 32K of file bar into foo at offset 0. xfs_io -c "reflink $MNT/bar 0 0 32K" $MNT/foo # Snapshot the default subvolume and create a full send stream (v2). btrfs subvolume snapshot -r $MNT $MNT/snap btrfs send --compressed-data -f /tmp/test.send $MNT/snap echo -e "\nFile bar in the original filesystem:" od -A d -t x1 $MNT/snap/bar umount $MNT mkfs.btrfs -f $DEV > /dev/null mount $DEV $MNT echo -e "\nReceiving stream in a new filesystem..." btrfs receive -f /tmp/test.send $MNT echo -e "\nFile bar in the new filesystem:" od -A d -t x1 $MNT/snap/bar umount $MNT Before this patch, the send stream included one regular write and one encoded write for file 'bar', with the later being not sector size aligned and causing the receiver to fallback to decompression + buffered writes. The output of the btrfs receive command in verbose mode (-vvv): (...) mkfile o258-7-0 rename o258-7-0 -> bar utimes clone bar - source=foo source offset=0 offset=0 length=32768 write bar - offset=32768 length=1024 encoded_write bar - offset=33792, len=4096, unencoded_offset=33792, unencoded_file_len=31744, unencoded_len=65536, compression=1, encryption=0 encoded_write bar - falling back to decompress and write due to errno 22 ("Invalid argument") (...) This patch avoids the regular write followed by an unaligned encoded write so that we end up sending a single encoded write that is aligned. So after this patch the stream content is (output of btrfs receive -vvv): (...) mkfile o258-7-0 rename o258-7-0 -> bar utimes clone bar - source=foo source offset=0 offset=0 length=32768 encoded_write bar - offset=32768, len=4096, unencoded_offset=32768, unencoded_file_len=32768, unencoded_len=65536, compression=1, encryption=0 (...) So we get more optimal behaviour and avoid the silent data loss bug in versions of btrfs-progs affected by the bug referred by the Link tag below (btrfs-progs v5.19, v5.19.1, v6.0 and v6.0.1). Link: https://lore.kernel.org/linux-btrfs/cover.1668529099.git.fdmanana@suse.com/ Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 145c84b44fd0..1c4b693ee4a3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5702,6 +5702,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, u64 ext_len; u64 clone_len; u64 clone_data_offset; + bool crossed_src_i_size = false; if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(clone_root->root, path); @@ -5759,8 +5760,10 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, if (key.offset >= clone_src_i_size) break; - if (key.offset + ext_len > clone_src_i_size) + if (key.offset + ext_len > clone_src_i_size) { ext_len = clone_src_i_size - key.offset; + crossed_src_i_size = true; + } clone_data_offset = btrfs_file_extent_offset(leaf, ei); if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) { @@ -5821,6 +5824,25 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, ret = send_clone(sctx, offset, clone_len, clone_root); } + } else if (crossed_src_i_size && clone_len < len) { + /* + * If we are at i_size of the clone source inode and we + * can not clone from it, terminate the loop. This is + * to avoid sending two write operations, one with a + * length matching clone_len and the final one after + * this loop with a length of len - clone_len. + * + * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED + * was passed to the send ioctl), this helps avoid + * sending an encoded write for an offset that is not + * sector size aligned, in case the i_size of the source + * inode is not sector size aligned. That will make the + * receiver fallback to decompression of the data and + * writing it using regular buffered IO, therefore while + * not incorrect, it's not optimal due decompression and + * possible re-compression at the receiver. + */ + break; } else { ret = send_extent_data(sctx, dst_path, offset, clone_len); -- cgit v1.2.3-58-ga151 From f7e942b5bb35d8e3af54053d19a6bf04143a3955 Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Wed, 16 Nov 2022 22:23:54 +0800 Subject: btrfs: qgroup: fix sleep from invalid context bug in btrfs_qgroup_inherit() Syzkaller reported BUG as follows: BUG: sleeping function called from invalid context at include/linux/sched/mm.h:274 Call Trace: dump_stack_lvl+0xcd/0x134 __might_resched.cold+0x222/0x26b kmem_cache_alloc+0x2e7/0x3c0 update_qgroup_limit_item+0xe1/0x390 btrfs_qgroup_inherit+0x147b/0x1ee0 create_subvol+0x4eb/0x1710 btrfs_mksubvol+0xfe5/0x13f0 __btrfs_ioctl_snap_create+0x2b0/0x430 btrfs_ioctl_snap_create_v2+0x25a/0x520 btrfs_ioctl+0x2a1c/0x5ce0 __x64_sys_ioctl+0x193/0x200 do_syscall_64+0x35/0x80 Fix this by calling qgroup_dirty() on @dstqgroup, and update limit item in btrfs_run_qgroups() later outside of the spinlock context. CC: stable@vger.kernel.org # 4.9+ Reviewed-by: Qu Wenruo Signed-off-by: ChenXiaoSong Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9334c3157c22..b74105a10f16 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2951,14 +2951,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, dstgroup->rsv_rfer = inherit->lim.rsv_rfer; dstgroup->rsv_excl = inherit->lim.rsv_excl; - ret = update_qgroup_limit_item(trans, dstgroup); - if (ret) { - qgroup_mark_inconsistent(fs_info); - btrfs_info(fs_info, - "unable to update quota limit for %llu", - dstgroup->qgroupid); - goto unlock; - } + qgroup_dirty(fs_info, dstgroup); } if (srcid) { -- cgit v1.2.3-58-ga151 From 8fe97d47b52ae1ad130470b1780f0ded4ba609a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 20 Nov 2022 13:43:03 +0100 Subject: btrfs: use kvcalloc in btrfs_get_dev_zone_info Otherwise the kernel memory allocator seems to be unhappy about failing order 6 allocations for the zones array, that cause 100% reproducible mount failures in my qemu setup: [26.078981] mount: page allocation failure: order:6, mode:0x40dc0(GFP_KERNEL|__GFP_COMP|__GFP_ZERO), nodemask=(null) [26.079741] CPU: 0 PID: 2965 Comm: mount Not tainted 6.1.0-rc5+ #185 [26.080181] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [26.080950] Call Trace: [26.081132] [26.081291] dump_stack_lvl+0x56/0x6f [26.081554] warn_alloc+0x117/0x140 [26.081808] ? __alloc_pages_direct_compact+0x1b5/0x300 [26.082174] __alloc_pages_slowpath.constprop.0+0xd0e/0xde0 [26.082569] __alloc_pages+0x32a/0x340 [26.082836] __kmalloc_large_node+0x4d/0xa0 [26.083133] ? trace_kmalloc+0x29/0xd0 [26.083399] kmalloc_large+0x14/0x60 [26.083654] btrfs_get_dev_zone_info+0x1b9/0xc00 [26.083980] ? _raw_spin_unlock_irqrestore+0x28/0x50 [26.084328] btrfs_get_dev_zone_info_all_devices+0x54/0x80 [26.084708] open_ctree+0xed4/0x1654 [26.084974] btrfs_mount_root.cold+0x12/0xde [26.085288] ? lock_is_held_type+0xe2/0x140 [26.085603] legacy_get_tree+0x28/0x50 [26.085876] vfs_get_tree+0x1d/0xb0 [26.086139] vfs_kern_mount.part.0+0x6c/0xb0 [26.086456] btrfs_mount+0x118/0x3a0 [26.086728] ? lock_is_held_type+0xe2/0x140 [26.087043] legacy_get_tree+0x28/0x50 [26.087323] vfs_get_tree+0x1d/0xb0 [26.087587] path_mount+0x2ba/0xbe0 [26.087850] ? _raw_spin_unlock_irqrestore+0x38/0x50 [26.088217] __x64_sys_mount+0xfe/0x140 [26.088506] do_syscall_64+0x35/0x80 [26.088776] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 5b316468983d ("btrfs: get zone information of zoned block devices") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 44d8177eeffc..c9e2b0c85309 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -467,7 +467,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) goto out; } - zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); + zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); if (!zones) { ret = -ENOMEM; goto out; @@ -586,7 +586,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) } - kfree(zones); + kvfree(zones); switch (bdev_zoned_model(bdev)) { case BLK_ZONED_HM: @@ -618,7 +618,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) return 0; out: - kfree(zones); + kvfree(zones); out_free_zone_info: btrfs_destroy_dev_zone_info(device); -- cgit v1.2.3-58-ga151 From 796787c978efbbdb50e245718c784eb94f59eac4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Nov 2022 10:23:22 +0000 Subject: btrfs: do not modify log tree while holding a leaf from fs tree locked When logging an inode in full mode, or when logging xattrs or when logging the dir index items of a directory, we are modifying the log tree while holding a read lock on a leaf from the fs/subvolume tree. This can lead to a deadlock in rare circumstances, but it is a real possibility, and it was recently reported by syzbot with the following trace from lockdep: WARNING: possible circular locking dependency detected 6.1.0-rc5-next-20221116-syzkaller #0 Not tainted ------------------------------------------------------ syz-executor.1/16154 is trying to acquire lock: ffff88807e3084a0 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0xa1/0xf30 fs/btrfs/delayed-inode.c:256 but task is already holding lock: ffff88807df33078 (btrfs-log-00){++++}-{3:3}, at: __btrfs_tree_lock+0x32/0x3d0 fs/btrfs/locking.c:197 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (btrfs-log-00){++++}-{3:3}: down_read_nested+0x9e/0x450 kernel/locking/rwsem.c:1634 __btrfs_tree_read_lock+0x32/0x350 fs/btrfs/locking.c:135 btrfs_tree_read_lock fs/btrfs/locking.c:141 [inline] btrfs_read_lock_root_node+0x82/0x3a0 fs/btrfs/locking.c:280 btrfs_search_slot_get_root fs/btrfs/ctree.c:1678 [inline] btrfs_search_slot+0x3ca/0x2c70 fs/btrfs/ctree.c:1998 btrfs_lookup_csum+0x116/0x3f0 fs/btrfs/file-item.c:209 btrfs_csum_file_blocks+0x40e/0x1370 fs/btrfs/file-item.c:1021 log_csums.isra.0+0x244/0x2d0 fs/btrfs/tree-log.c:4258 copy_items.isra.0+0xbfb/0xed0 fs/btrfs/tree-log.c:4403 copy_inode_items_to_log+0x13d6/0x1d90 fs/btrfs/tree-log.c:5873 btrfs_log_inode+0xb19/0x4680 fs/btrfs/tree-log.c:6495 btrfs_log_inode_parent+0x890/0x2a20 fs/btrfs/tree-log.c:6982 btrfs_log_dentry_safe+0x59/0x80 fs/btrfs/tree-log.c:7083 btrfs_sync_file+0xa41/0x13c0 fs/btrfs/file.c:1921 vfs_fsync_range+0x13e/0x230 fs/sync.c:188 generic_write_sync include/linux/fs.h:2856 [inline] iomap_dio_complete+0x73a/0x920 fs/iomap/direct-io.c:128 btrfs_direct_write fs/btrfs/file.c:1536 [inline] btrfs_do_write_iter+0xba2/0x1470 fs/btrfs/file.c:1668 call_write_iter include/linux/fs.h:2160 [inline] do_iter_readv_writev+0x20b/0x3b0 fs/read_write.c:735 do_iter_write+0x182/0x700 fs/read_write.c:861 vfs_iter_write+0x74/0xa0 fs/read_write.c:902 iter_file_splice_write+0x745/0xc90 fs/splice.c:686 do_splice_from fs/splice.c:764 [inline] direct_splice_actor+0x114/0x180 fs/splice.c:931 splice_direct_to_actor+0x335/0x8a0 fs/splice.c:886 do_splice_direct+0x1ab/0x280 fs/splice.c:974 do_sendfile+0xb19/0x1270 fs/read_write.c:1255 __do_sys_sendfile64 fs/read_write.c:1323 [inline] __se_sys_sendfile64 fs/read_write.c:1309 [inline] __x64_sys_sendfile64+0x259/0x2c0 fs/read_write.c:1309 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd -> #1 (btrfs-tree-00){++++}-{3:3}: __lock_release kernel/locking/lockdep.c:5382 [inline] lock_release+0x371/0x810 kernel/locking/lockdep.c:5688 up_write+0x2a/0x520 kernel/locking/rwsem.c:1614 btrfs_tree_unlock_rw fs/btrfs/locking.h:189 [inline] btrfs_unlock_up_safe+0x1e3/0x290 fs/btrfs/locking.c:238 search_leaf fs/btrfs/ctree.c:1832 [inline] btrfs_search_slot+0x265e/0x2c70 fs/btrfs/ctree.c:2074 btrfs_insert_empty_items+0xbd/0x1c0 fs/btrfs/ctree.c:4133 btrfs_insert_delayed_item+0x826/0xfa0 fs/btrfs/delayed-inode.c:746 btrfs_insert_delayed_items fs/btrfs/delayed-inode.c:824 [inline] __btrfs_commit_inode_delayed_items fs/btrfs/delayed-inode.c:1111 [inline] __btrfs_run_delayed_items+0x280/0x590 fs/btrfs/delayed-inode.c:1153 flush_space+0x147/0xe90 fs/btrfs/space-info.c:728 btrfs_async_reclaim_metadata_space+0x541/0xc10 fs/btrfs/space-info.c:1086 process_one_work+0x9bf/0x1710 kernel/workqueue.c:2289 worker_thread+0x669/0x1090 kernel/workqueue.c:2436 kthread+0x2e8/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 -> #0 (&delayed_node->mutex){+.+.}-{3:3}: check_prev_add kernel/locking/lockdep.c:3097 [inline] check_prevs_add kernel/locking/lockdep.c:3216 [inline] validate_chain kernel/locking/lockdep.c:3831 [inline] __lock_acquire+0x2a43/0x56d0 kernel/locking/lockdep.c:5055 lock_acquire kernel/locking/lockdep.c:5668 [inline] lock_acquire+0x1e3/0x630 kernel/locking/lockdep.c:5633 __mutex_lock_common kernel/locking/mutex.c:603 [inline] __mutex_lock+0x12f/0x1360 kernel/locking/mutex.c:747 __btrfs_release_delayed_node.part.0+0xa1/0xf30 fs/btrfs/delayed-inode.c:256 __btrfs_release_delayed_node fs/btrfs/delayed-inode.c:251 [inline] btrfs_release_delayed_node fs/btrfs/delayed-inode.c:281 [inline] btrfs_remove_delayed_node+0x52/0x60 fs/btrfs/delayed-inode.c:1285 btrfs_evict_inode+0x511/0xf30 fs/btrfs/inode.c:5554 evict+0x2ed/0x6b0 fs/inode.c:664 dispose_list+0x117/0x1e0 fs/inode.c:697 prune_icache_sb+0xeb/0x150 fs/inode.c:896 super_cache_scan+0x391/0x590 fs/super.c:106 do_shrink_slab+0x464/0xce0 mm/vmscan.c:843 shrink_slab_memcg mm/vmscan.c:912 [inline] shrink_slab+0x388/0x660 mm/vmscan.c:991 shrink_node_memcgs mm/vmscan.c:6088 [inline] shrink_node+0x93d/0x1f30 mm/vmscan.c:6117 shrink_zones mm/vmscan.c:6355 [inline] do_try_to_free_pages+0x3b4/0x17a0 mm/vmscan.c:6417 try_to_free_mem_cgroup_pages+0x3a4/0xa70 mm/vmscan.c:6732 reclaim_high.constprop.0+0x182/0x230 mm/memcontrol.c:2393 mem_cgroup_handle_over_high+0x190/0x520 mm/memcontrol.c:2578 try_charge_memcg+0xe0c/0x12f0 mm/memcontrol.c:2816 try_charge mm/memcontrol.c:2827 [inline] charge_memcg+0x90/0x3b0 mm/memcontrol.c:6889 __mem_cgroup_charge+0x2b/0x90 mm/memcontrol.c:6910 mem_cgroup_charge include/linux/memcontrol.h:667 [inline] __filemap_add_folio+0x615/0xf80 mm/filemap.c:852 filemap_add_folio+0xaf/0x1e0 mm/filemap.c:934 __filemap_get_folio+0x389/0xd80 mm/filemap.c:1976 pagecache_get_page+0x2e/0x280 mm/folio-compat.c:104 find_or_create_page include/linux/pagemap.h:612 [inline] alloc_extent_buffer+0x2b9/0x1580 fs/btrfs/extent_io.c:4588 btrfs_init_new_buffer fs/btrfs/extent-tree.c:4869 [inline] btrfs_alloc_tree_block+0x2e1/0x1320 fs/btrfs/extent-tree.c:4988 __btrfs_cow_block+0x3b2/0x1420 fs/btrfs/ctree.c:440 btrfs_cow_block+0x2fa/0x950 fs/btrfs/ctree.c:595 btrfs_search_slot+0x11b0/0x2c70 fs/btrfs/ctree.c:2038 btrfs_update_root+0xdb/0x630 fs/btrfs/root-tree.c:137 update_log_root fs/btrfs/tree-log.c:2841 [inline] btrfs_sync_log+0xbfb/0x2870 fs/btrfs/tree-log.c:3064 btrfs_sync_file+0xdb9/0x13c0 fs/btrfs/file.c:1947 vfs_fsync_range+0x13e/0x230 fs/sync.c:188 generic_write_sync include/linux/fs.h:2856 [inline] iomap_dio_complete+0x73a/0x920 fs/iomap/direct-io.c:128 btrfs_direct_write fs/btrfs/file.c:1536 [inline] btrfs_do_write_iter+0xba2/0x1470 fs/btrfs/file.c:1668 call_write_iter include/linux/fs.h:2160 [inline] do_iter_readv_writev+0x20b/0x3b0 fs/read_write.c:735 do_iter_write+0x182/0x700 fs/read_write.c:861 vfs_iter_write+0x74/0xa0 fs/read_write.c:902 iter_file_splice_write+0x745/0xc90 fs/splice.c:686 do_splice_from fs/splice.c:764 [inline] direct_splice_actor+0x114/0x180 fs/splice.c:931 splice_direct_to_actor+0x335/0x8a0 fs/splice.c:886 do_splice_direct+0x1ab/0x280 fs/splice.c:974 do_sendfile+0xb19/0x1270 fs/read_write.c:1255 __do_sys_sendfile64 fs/read_write.c:1323 [inline] __se_sys_sendfile64 fs/read_write.c:1309 [inline] __x64_sys_sendfile64+0x259/0x2c0 fs/read_write.c:1309 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd other info that might help us debug this: Chain exists of: &delayed_node->mutex --> btrfs-tree-00 --> btrfs-log-00 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(btrfs-log-00); lock(btrfs-tree-00); lock(btrfs-log-00); lock(&delayed_node->mutex); Holding a read lock on a leaf from a fs/subvolume tree creates a nasty lock dependency when we are COWing extent buffers for the log tree and we have two tasks modifying the log tree, with each one in one of the following 2 scenarios: 1) Modifying the log tree triggers an extent buffer allocation while holding a write lock on a parent extent buffer from the log tree. Allocating the pages for an extent buffer, or the extent buffer struct, can trigger inode eviction and finally the inode eviction will trigger a release/remove of a delayed node, which requires taking the delayed node's mutex; 2) Allocating a metadata extent for a log tree can trigger the async reclaim thread and make us wait for it to release enough space and unblock our reservation ticket. The reclaim thread can start flushing delayed items, and that in turn results in the need to lock delayed node mutexes and in the need to write lock extent buffers of a subvolume tree - all this while holding a write lock on the parent extent buffer in the log tree. So one task in scenario 1) running in parallel with another task in scenario 2) could lead to a deadlock, one wanting to lock a delayed node mutex while having a read lock on a leaf from the subvolume, while the other is holding the delayed node's mutex and wants to write lock the same subvolume leaf for flushing delayed items. Fix this by cloning the leaf of the fs/subvolume tree, release/unlock the fs/subvolume leaf and use the clone leaf instead. Reported-by: syzbot+9b7c21f486f5e7f8d029@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/000000000000ccc93c05edc4d8cf@google.com/ CC: stable@vger.kernel.org # 6.0+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 813986e38258..c3cf3dabe0b1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3694,15 +3694,29 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, u64 *last_old_dentry_offset) { struct btrfs_root *log = inode->root->log_root; - struct extent_buffer *src = path->nodes[0]; - const int nritems = btrfs_header_nritems(src); + struct extent_buffer *src; + const int nritems = btrfs_header_nritems(path->nodes[0]); const u64 ino = btrfs_ino(inode); bool last_found = false; int batch_start = 0; int batch_size = 0; int i; - for (i = path->slots[0]; i < nritems; i++) { + /* + * We need to clone the leaf, release the read lock on it, and use the + * clone before modifying the log tree. See the comment at copy_items() + * about why we need to do this. + */ + src = btrfs_clone_extent_buffer(path->nodes[0]); + if (!src) + return -ENOMEM; + + i = path->slots[0]; + btrfs_release_path(path); + path->nodes[0] = src; + path->slots[0] = i; + + for (; i < nritems; i++) { struct btrfs_dir_item *di; struct btrfs_key key; int ret; @@ -4303,7 +4317,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, { struct btrfs_root *log = inode->root->log_root; struct btrfs_file_extent_item *extent; - struct extent_buffer *src = src_path->nodes[0]; + struct extent_buffer *src; int ret = 0; struct btrfs_key *ins_keys; u32 *ins_sizes; @@ -4314,6 +4328,43 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); const u64 i_size = i_size_read(&inode->vfs_inode); + /* + * To keep lockdep happy and avoid deadlocks, clone the source leaf and + * use the clone. This is because otherwise we would be changing the log + * tree, to insert items from the subvolume tree or insert csum items, + * while holding a read lock on a leaf from the subvolume tree, which + * creates a nasty lock dependency when COWing log tree nodes/leaves: + * + * 1) Modifying the log tree triggers an extent buffer allocation while + * holding a write lock on a parent extent buffer from the log tree. + * Allocating the pages for an extent buffer, or the extent buffer + * struct, can trigger inode eviction and finally the inode eviction + * will trigger a release/remove of a delayed node, which requires + * taking the delayed node's mutex; + * + * 2) Allocating a metadata extent for a log tree can trigger the async + * reclaim thread and make us wait for it to release enough space and + * unblock our reservation ticket. The reclaim thread can start + * flushing delayed items, and that in turn results in the need to + * lock delayed node mutexes and in the need to write lock extent + * buffers of a subvolume tree - all this while holding a write lock + * on the parent extent buffer in the log tree. + * + * So one task in scenario 1) running in parallel with another task in + * scenario 2) could lead to a deadlock, one wanting to lock a delayed + * node mutex while having a read lock on a leaf from the subvolume, + * while the other is holding the delayed node's mutex and wants to + * write lock the same subvolume leaf for flushing delayed items. + */ + src = btrfs_clone_extent_buffer(src_path->nodes[0]); + if (!src) + return -ENOMEM; + + i = src_path->slots[0]; + btrfs_release_path(src_path); + src_path->nodes[0] = src; + src_path->slots[0] = i; + ins_data = kmalloc(nr * sizeof(struct btrfs_key) + nr * sizeof(u32), GFP_NOFS); if (!ins_data) -- cgit v1.2.3-58-ga151 From ffdbb44f2f23f963b8f5672e35c3a26088177a62 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Tue, 22 Nov 2022 19:50:02 +0800 Subject: btrfs: sysfs: normalize the error handling branch in btrfs_init_sysfs() Although kset_unregister() can eventually remove all attribute files, explicitly rolling back with the matching function makes the code logic look clearer. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Zhen Lei Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 699b54b3acaa..74fef1f49c35 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -2321,8 +2321,11 @@ int __init btrfs_init_sysfs(void) #ifdef CONFIG_BTRFS_DEBUG ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group); - if (ret) - goto out2; + if (ret) { + sysfs_unmerge_group(&btrfs_kset->kobj, + &btrfs_static_feature_attr_group); + goto out_remove_group; + } #endif return 0; -- cgit v1.2.3-58-ga151