diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-03-01 12:29:20 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-03-01 12:29:20 -0800 |
commit | 7505aa147adb10913c1b72e947006b6070753eb6 (patch) | |
tree | 72eb1c2b0e4933261e20db034443a33d5a8d2184 | |
parent | 3aec97e0c793353318235b5e9031f91b8f09241e (diff) | |
parent | e2b54eaf28df0c978626c9736b94f003b523b451 (diff) |
Merge tag 'for-6.8-rc6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs fixes from David Sterba:
- fix freeing allocated id for anon dev when snapshot creation fails
- fiemap fixes:
- followup for a recent deadlock fix, ranges that fiemap can access
can still race with ordered extent completion
- make sure fiemap with SYNC flag does not race with writes
* tag 'for-6.8-rc6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
btrfs: fix double free of anonymous device after snapshot creation failure
btrfs: ensure fiemap doesn't race with writes when FIEMAP_FLAG_SYNC is given
btrfs: fix race between ordered extent completion and fiemap
-rw-r--r-- | fs/btrfs/disk-io.c | 22 | ||||
-rw-r--r-- | fs/btrfs/disk-io.h | 2 | ||||
-rw-r--r-- | fs/btrfs/extent_io.c | 124 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 22 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 2 | ||||
-rw-r--r-- | fs/btrfs/transaction.c | 2 |
6 files changed, 139 insertions, 35 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e71ef97d0a7c..c843563914ca 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1307,12 +1307,12 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) * * @objectid: root id * @anon_dev: preallocated anonymous block device number for new roots, - * pass 0 for new allocation. + * pass NULL for a new allocation. * @check_ref: whether to check root item references, If true, return -ENOENT * for orphan roots */ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, - u64 objectid, dev_t anon_dev, + u64 objectid, dev_t *anon_dev, bool check_ref) { struct btrfs_root *root; @@ -1342,9 +1342,9 @@ again: * that common but still possible. In that case, we just need * to free the anon_dev. */ - if (unlikely(anon_dev)) { - free_anon_bdev(anon_dev); - anon_dev = 0; + if (unlikely(anon_dev && *anon_dev)) { + free_anon_bdev(*anon_dev); + *anon_dev = 0; } if (check_ref && btrfs_root_refs(&root->root_item) == 0) { @@ -1366,7 +1366,7 @@ again: goto fail; } - ret = btrfs_init_fs_root(root, anon_dev); + ret = btrfs_init_fs_root(root, anon_dev ? *anon_dev : 0); if (ret) goto fail; @@ -1402,7 +1402,7 @@ fail: * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root() * and once again by our caller. */ - if (anon_dev) + if (anon_dev && *anon_dev) root->anon_dev = 0; btrfs_put_root(root); return ERR_PTR(ret); @@ -1418,7 +1418,7 @@ fail: struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, u64 objectid, bool check_ref) { - return btrfs_get_root_ref(fs_info, objectid, 0, check_ref); + return btrfs_get_root_ref(fs_info, objectid, NULL, check_ref); } /* @@ -1426,11 +1426,11 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, * the anonymous block device id * * @objectid: tree objectid - * @anon_dev: if zero, allocate a new anonymous block device or use the - * parameter value + * @anon_dev: if NULL, allocate a new anonymous block device or use the + * parameter value if not NULL */ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, - u64 objectid, dev_t anon_dev) + u64 objectid, dev_t *anon_dev) { return btrfs_get_root_ref(fs_info, objectid, anon_dev, true); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 9413726b329b..eb3473d1c1ac 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -61,7 +61,7 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, u64 objectid, bool check_ref); struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, - u64 objectid, dev_t anon_dev); + u64 objectid, dev_t *anon_dev); struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 objectid); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3e19a2475ab3..8b4bef05e222 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2480,6 +2480,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache, u64 offset, u64 phys, u64 len, u32 flags) { + u64 cache_end; int ret = 0; /* Set at the end of extent_fiemap(). */ @@ -2489,15 +2490,102 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, goto assign; /* - * Sanity check, extent_fiemap() should have ensured that new - * fiemap extent won't overlap with cached one. - * Not recoverable. + * When iterating the extents of the inode, at extent_fiemap(), we may + * find an extent that starts at an offset behind the end offset of the + * previous extent we processed. This happens if fiemap is called + * without FIEMAP_FLAG_SYNC and there are ordered extents completing + * while we call btrfs_next_leaf() (through fiemap_next_leaf_item()). * - * NOTE: Physical address can overlap, due to compression + * For example we are in leaf X processing its last item, which is the + * file extent item for file range [512K, 1M[, and after + * btrfs_next_leaf() releases the path, there's an ordered extent that + * completes for the file range [768K, 2M[, and that results in trimming + * the file extent item so that it now corresponds to the file range + * [512K, 768K[ and a new file extent item is inserted for the file + * range [768K, 2M[, which may end up as the last item of leaf X or as + * the first item of the next leaf - in either case btrfs_next_leaf() + * will leave us with a path pointing to the new extent item, for the + * file range [768K, 2M[, since that's the first key that follows the + * last one we processed. So in order not to report overlapping extents + * to user space, we trim the length of the previously cached extent and + * emit it. + * + * Upon calling btrfs_next_leaf() we may also find an extent with an + * offset smaller than or equals to cache->offset, and this happens + * when we had a hole or prealloc extent with several delalloc ranges in + * it, but after btrfs_next_leaf() released the path, delalloc was + * flushed and the resulting ordered extents were completed, so we can + * now have found a file extent item for an offset that is smaller than + * or equals to what we have in cache->offset. We deal with this as + * described below. */ - if (cache->offset + cache->len > offset) { - WARN_ON(1); - return -EINVAL; + cache_end = cache->offset + cache->len; + if (cache_end > offset) { + if (offset == cache->offset) { + /* + * We cached a dealloc range (found in the io tree) for + * a hole or prealloc extent and we have now found a + * file extent item for the same offset. What we have + * now is more recent and up to date, so discard what + * we had in the cache and use what we have just found. + */ + goto assign; + } else if (offset > cache->offset) { + /* + * The extent range we previously found ends after the + * offset of the file extent item we found and that + * offset falls somewhere in the middle of that previous + * extent range. So adjust the range we previously found + * to end at the offset of the file extent item we have + * just found, since this extent is more up to date. + * Emit that adjusted range and cache the file extent + * item we have just found. This corresponds to the case + * where a previously found file extent item was split + * due to an ordered extent completing. + */ + cache->len = offset - cache->offset; + goto emit; + } else { + const u64 range_end = offset + len; + + /* + * The offset of the file extent item we have just found + * is behind the cached offset. This means we were + * processing a hole or prealloc extent for which we + * have found delalloc ranges (in the io tree), so what + * we have in the cache is the last delalloc range we + * found while the file extent item we found can be + * either for a whole delalloc range we previously + * emmitted or only a part of that range. + * + * We have two cases here: + * + * 1) The file extent item's range ends at or behind the + * cached extent's end. In this case just ignore the + * current file extent item because we don't want to + * overlap with previous ranges that may have been + * emmitted already; + * + * 2) The file extent item starts behind the currently + * cached extent but its end offset goes beyond the + * end offset of the cached extent. We don't want to + * overlap with a previous range that may have been + * emmitted already, so we emit the currently cached + * extent and then partially store the current file + * extent item's range in the cache, for the subrange + * going the cached extent's end to the end of the + * file extent item. + */ + if (range_end <= cache_end) + return 0; + + if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC))) + phys += cache_end - offset; + + offset = cache_end; + len = range_end - cache_end; + goto emit; + } } /* @@ -2517,6 +2605,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, return 0; } +emit: /* Not mergeable, need to submit cached one */ ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, cache->len, cache->flags); @@ -2907,17 +2996,15 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, range_end = round_up(start + len, sectorsize); prev_extent_end = range_start; - btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); - ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); if (ret < 0) - goto out_unlock; + goto out; btrfs_release_path(path); path->reada = READA_FORWARD; ret = fiemap_search_slot(inode, path, range_start); if (ret < 0) { - goto out_unlock; + goto out; } else if (ret > 0) { /* * No file extent item found, but we may have delalloc between @@ -2964,7 +3051,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, backref_ctx, 0, 0, 0, prev_extent_end, hole_end); if (ret < 0) { - goto out_unlock; + goto out; } else if (ret > 0) { /* fiemap_fill_next_extent() told us to stop. */ stopped = true; @@ -3020,7 +3107,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, extent_gen, backref_ctx); if (ret < 0) - goto out_unlock; + goto out; else if (ret > 0) flags |= FIEMAP_EXTENT_SHARED; } @@ -3031,7 +3118,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, } if (ret < 0) { - goto out_unlock; + goto out; } else if (ret > 0) { /* fiemap_fill_next_extent() told us to stop. */ stopped = true; @@ -3042,12 +3129,12 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, next_item: if (fatal_signal_pending(current)) { ret = -EINTR; - goto out_unlock; + goto out; } ret = fiemap_next_leaf_item(inode, path); if (ret < 0) { - goto out_unlock; + goto out; } else if (ret > 0) { /* No more file extent items for this inode. */ break; @@ -3071,7 +3158,7 @@ check_eof_delalloc: &delalloc_cached_state, backref_ctx, 0, 0, 0, prev_extent_end, range_end - 1); if (ret < 0) - goto out_unlock; + goto out; prev_extent_end = range_end; } @@ -3109,9 +3196,6 @@ check_eof_delalloc: } ret = emit_last_fiemap_cache(fieinfo, &cache); - -out_unlock: - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); out: free_extent_state(delalloc_cached_state); btrfs_free_backref_share_ctx(backref_ctx); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f88e0ca8331d..4795738d5785 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7835,6 +7835,7 @@ struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); int ret; ret = fiemap_prep(inode, fieinfo, start, &len, 0); @@ -7860,7 +7861,26 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } - return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); + btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED); + + /* + * We did an initial flush to avoid holding the inode's lock while + * triggering writeback and waiting for the completion of IO and ordered + * extents. Now after we locked the inode we do it again, because it's + * possible a new write may have happened in between those two steps. + */ + if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { + ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX); + if (ret) { + btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); + return ret; + } + } + + ret = extent_fiemap(btrfs_inode, fieinfo, start, len); + btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); + + return ret; } static int btrfs_writepages(struct address_space *mapping, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ac3316e0d11c..9d1eac15e09e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -721,7 +721,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, free_extent_buffer(leaf); leaf = NULL; - new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev); + new_root = btrfs_get_new_fs_root(fs_info, objectid, &anon_dev); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); btrfs_abort_transaction(trans, ret); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c52807d97efa..bf8e64c766b6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1834,7 +1834,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } key.offset = (u64)-1; - pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev); + pending->snap = btrfs_get_new_fs_root(fs_info, objectid, &pending->anon_dev); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); pending->snap = NULL; |