diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-05-14 17:25:36 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-05-14 17:25:36 -0700 |
commit | a3d1f54d7aa4c3be2c6a10768d4ffa1dcb620da9 (patch) | |
tree | 845999b16092da40c30bcffa3b7369637f3c85a6 | |
parent | 47e9bff7fc042b28eb4cf375f0cf249ab708fdfa (diff) | |
parent | 0e39c9e524479b85c1b83134df0cfc6e3cb5353a (diff) |
Merge tag 'for-6.10-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
"This update brings a few minor performance improvements, otherwise
there's a lot of refactoring, cleanups and other sort of not user
visible changes.
Performance improvements:
- inline b-tree locking functions, improvement in metadata-heavy
changes
- relax locking on a range that's being reflinked, allows read
operations to run in parallel
- speed up NOCOW write checks (throughput +9% on a sample test)
- extent locking ranges have been reduced in several places, namely
around delayed ref processing
Core:
- more page to folio conversions:
- relocation
- send
- compression
- inline extent handling
- super block write and wait
- extent_map structure optimizations:
- reduced structure size
- code simplifications
- add shrinker for allocated objects, the numbers can go high and
could exhaust memory on smaller systems (reported) as they may
not get an opportunity to be freed fast enough
- extent locking optimizations:
- reduce locking ranges where it does not seem to be necessary and
are safe due to other means of synchronization
- potential improvements due to lower contention,
allocation/freeing and state management operations of extent
state tracking structures
- delayed ref cleanups and simplifications
- updated trace points
- improved error handling, warnings and assertions
- cleanups and refactoring, unification of error handling paths"
* tag 'for-6.10-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (122 commits)
btrfs: qgroup: fix initialization of auto inherit array
btrfs: count super block write errors in device instead of tracking folio error state
btrfs: use the folio iterator in btrfs_end_super_write()
btrfs: convert super block writes to folio in write_dev_supers()
btrfs: convert super block writes to folio in wait_dev_supers()
bio: Export bio_add_folio_nofail to modules
btrfs: remove duplicate included header from fs.h
btrfs: add a cached state to extent_clear_unlock_delalloc
btrfs: push extent lock down in submit_one_async_extent
btrfs: push lock_extent down in cow_file_range()
btrfs: move can_cow_file_range_inline() outside of the extent lock
btrfs: push lock_extent into cow_file_range_inline
btrfs: push extent lock into cow_file_range
btrfs: push extent lock into run_delalloc_cow
btrfs: remove unlock_extent from run_delalloc_compressed
btrfs: push extent lock down in run_delalloc_nocow
btrfs: adjust while loop condition in run_delalloc_nocow
btrfs: push extent lock into run_delalloc_nocow
btrfs: push the extent lock into btrfs_run_delalloc_range
btrfs: lock extent when doing inline extent in compression
...
52 files changed, 2639 insertions, 2346 deletions
diff --git a/block/bio.c b/block/bio.c index 53f608028c78..e9e809a63c59 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1136,6 +1136,7 @@ void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, WARN_ON_ONCE(off > UINT_MAX); __bio_add_page(bio, &folio->page, len, off); } +EXPORT_SYMBOL_GPL(bio_add_folio_nofail); /** * bio_add_folio - Attempt to add part of a folio to a bio. diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 58110c968667..a2de5c05f97c 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -261,7 +261,7 @@ static void update_share_count(struct share_check *sc, int oldcount, else if (oldcount < 1 && newcount > 0) sc->share_count++; - if (newref->root_id == sc->root->root_key.objectid && + if (newref->root_id == btrfs_root_id(sc->root) && newref->wanted_disk_byte == sc->data_bytenr && newref->key_for_search.objectid == sc->inum) sc->self_ref_count += newref->count; @@ -769,7 +769,7 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx, continue; } - if (sc && ref->root_id != sc->root->root_key.objectid) { + if (sc && ref->root_id != btrfs_root_id(sc->root)) { free_pref(ref); ret = BACKREF_FOUND_SHARED; goto out; @@ -919,40 +919,38 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, switch (node->type) { case BTRFS_TREE_BLOCK_REF_KEY: { /* NORMAL INDIRECT METADATA backref */ - struct btrfs_delayed_tree_ref *ref; struct btrfs_key *key_ptr = NULL; + /* The owner of a tree block ref is the level. */ + int level = btrfs_delayed_ref_owner(node); if (head->extent_op && head->extent_op->update_key) { btrfs_disk_key_to_cpu(&key, &head->extent_op->key); key_ptr = &key; } - ref = btrfs_delayed_node_to_tree_ref(node); - ret = add_indirect_ref(fs_info, preftrees, ref->root, - key_ptr, ref->level + 1, - node->bytenr, count, sc, - GFP_ATOMIC); + ret = add_indirect_ref(fs_info, preftrees, node->ref_root, + key_ptr, level + 1, node->bytenr, + count, sc, GFP_ATOMIC); break; } case BTRFS_SHARED_BLOCK_REF_KEY: { - /* SHARED DIRECT METADATA backref */ - struct btrfs_delayed_tree_ref *ref; - - ref = btrfs_delayed_node_to_tree_ref(node); + /* + * SHARED DIRECT METADATA backref + * + * The owner of a tree block ref is the level. + */ + int level = btrfs_delayed_ref_owner(node); - ret = add_direct_ref(fs_info, preftrees, ref->level + 1, - ref->parent, node->bytenr, count, + ret = add_direct_ref(fs_info, preftrees, level + 1, + node->parent, node->bytenr, count, sc, GFP_ATOMIC); break; } case BTRFS_EXTENT_DATA_REF_KEY: { /* NORMAL INDIRECT DATA backref */ - struct btrfs_delayed_data_ref *ref; - ref = btrfs_delayed_node_to_data_ref(node); - - key.objectid = ref->objectid; + key.objectid = btrfs_delayed_ref_owner(node); key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = ref->offset; + key.offset = btrfs_delayed_ref_offset(node); /* * If we have a share check context and a reference for @@ -972,18 +970,14 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, if (sc && count < 0) sc->have_delayed_delete_refs = true; - ret = add_indirect_ref(fs_info, preftrees, ref->root, + ret = add_indirect_ref(fs_info, preftrees, node->ref_root, &key, 0, node->bytenr, count, sc, GFP_ATOMIC); break; } case BTRFS_SHARED_DATA_REF_KEY: { /* SHARED DIRECT FULL backref */ - struct btrfs_delayed_data_ref *ref; - - ref = btrfs_delayed_node_to_data_ref(node); - - ret = add_direct_ref(fs_info, preftrees, 0, ref->parent, + ret = add_direct_ref(fs_info, preftrees, 0, node->parent, node->bytenr, count, sc, GFP_ATOMIC); break; @@ -2629,7 +2623,7 @@ static int iterate_inode_refs(u64 inum, struct inode_fs_paths *ipath) btrfs_debug(fs_root->fs_info, "following ref at offset %u for inode %llu in tree %llu", cur, found_key.objectid, - fs_root->root_key.objectid); + btrfs_root_id(fs_root)); ret = inode_to_path(parent, name_len, (unsigned long)(iref + 1), eb, ipath); if (ret) @@ -3361,7 +3355,7 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) { btrfs_err(fs_info, "couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)", - cur->bytenr, level - 1, root->root_key.objectid, + cur->bytenr, level - 1, btrfs_root_id(root), tree_key->objectid, tree_key->type, tree_key->offset); btrfs_put_root(root); ret = -ENOENT; diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 95c174f9fd4f..b299b82d676e 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -341,9 +341,9 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) read_lock(&fs_info->global_root_lock); rbtree_postorder_for_each_entry_safe(root, tmp, &fs_info->global_root_tree, rb_node) { - if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID || - root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID || - root->root_key.objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) { + if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID || + btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID || + btrfs_root_id(root) == BTRFS_FREE_SPACE_TREE_OBJECTID) { num_bytes += btrfs_root_used(&root->root_item); min_items++; } @@ -406,7 +406,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - switch (root->root_key.objectid) { + switch (btrfs_root_id(root)) { case BTRFS_CSUM_TREE_OBJECTID: case BTRFS_EXTENT_TREE_OBJECTID: case BTRFS_FREE_SPACE_TREE_OBJECTID: @@ -468,8 +468,7 @@ static struct btrfs_block_rsv *get_block_rsv( if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || (root == fs_info->uuid_root) || - (trans->adding_csums && - root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID)) + (trans->adding_csums && btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID)) block_rsv = trans->block_rsv; if (!block_rsv) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 100020ca4658..91c994b569f3 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -381,9 +381,11 @@ static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode) } /* - * Should be called while holding the inode's VFS lock in exclusive mode or in a - * context where no one else can access the inode concurrently (during inode - * creation or when loading an inode from disk). + * Should be called while holding the inode's VFS lock in exclusive mode, or + * while holding the inode's mmap lock (struct btrfs_inode::i_mmap_lock) in + * either shared or exclusive mode, or in a context where no one else can access + * the inode concurrently (during inode creation or when loading an inode from + * disk). */ static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode) { @@ -496,7 +498,6 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct extent_state *orig, u64 split); void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); -vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); void btrfs_evict_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); @@ -544,6 +545,7 @@ ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before); struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, size_t done_before); +struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino); extern const struct dentry_operations btrfs_dentry_operations; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b2b94009959d..6441e47d8a5e 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -90,20 +90,20 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len) } static int compression_compress_pages(int type, struct list_head *ws, - struct address_space *mapping, u64 start, struct page **pages, - unsigned long *out_pages, unsigned long *total_in, - unsigned long *total_out) + struct address_space *mapping, u64 start, + struct folio **folios, unsigned long *out_folios, + unsigned long *total_in, unsigned long *total_out) { switch (type) { case BTRFS_COMPRESS_ZLIB: - return zlib_compress_pages(ws, mapping, start, pages, - out_pages, total_in, total_out); + return zlib_compress_folios(ws, mapping, start, folios, + out_folios, total_in, total_out); case BTRFS_COMPRESS_LZO: - return lzo_compress_pages(ws, mapping, start, pages, - out_pages, total_in, total_out); + return lzo_compress_folios(ws, mapping, start, folios, + out_folios, total_in, total_out); case BTRFS_COMPRESS_ZSTD: - return zstd_compress_pages(ws, mapping, start, pages, - out_pages, total_in, total_out); + return zstd_compress_folios(ws, mapping, start, folios, + out_folios, total_in, total_out); case BTRFS_COMPRESS_NONE: default: /* @@ -115,7 +115,7 @@ static int compression_compress_pages(int type, struct list_head *ws, * Not a big deal, just need to inform caller that we * haven't allocated any pages yet. */ - *out_pages = 0; + *out_folios = 0; return -E2BIG; } } @@ -158,11 +158,11 @@ static int compression_decompress(int type, struct list_head *ws, } } -static void btrfs_free_compressed_pages(struct compressed_bio *cb) +static void btrfs_free_compressed_folios(struct compressed_bio *cb) { - for (unsigned int i = 0; i < cb->nr_pages; i++) - btrfs_free_compr_page(cb->compressed_pages[i]); - kfree(cb->compressed_pages); + for (unsigned int i = 0; i < cb->nr_folios; i++) + btrfs_free_compr_folio(cb->compressed_folios[i]); + kfree(cb->compressed_folios); } static int btrfs_decompress_bio(struct compressed_bio *cb); @@ -223,25 +223,25 @@ static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_co /* * Common wrappers for page allocation from compression wrappers */ -struct page *btrfs_alloc_compr_page(void) +struct folio *btrfs_alloc_compr_folio(void) { - struct page *page = NULL; + struct folio *folio = NULL; spin_lock(&compr_pool.lock); if (compr_pool.count > 0) { - page = list_first_entry(&compr_pool.list, struct page, lru); - list_del_init(&page->lru); + folio = list_first_entry(&compr_pool.list, struct folio, lru); + list_del_init(&folio->lru); compr_pool.count--; } spin_unlock(&compr_pool.lock); - if (page) - return page; + if (folio) + return folio; - return alloc_page(GFP_NOFS); + return folio_alloc(GFP_NOFS, 0); } -void btrfs_free_compr_page(struct page *page) +void btrfs_free_compr_folio(struct folio *folio) { bool do_free = false; @@ -249,7 +249,7 @@ void btrfs_free_compr_page(struct page *page) if (compr_pool.count > compr_pool.thresh) { do_free = true; } else { - list_add(&page->lru, &compr_pool.list); + list_add(&folio->lru, &compr_pool.list); compr_pool.count++; } spin_unlock(&compr_pool.lock); @@ -257,8 +257,8 @@ void btrfs_free_compr_page(struct page *page) if (!do_free) return; - ASSERT(page_ref_count(page) == 1); - put_page(page); + ASSERT(folio_ref_count(folio) == 1); + folio_put(folio); } static void end_bbio_comprssed_read(struct btrfs_bio *bbio) @@ -269,7 +269,7 @@ static void end_bbio_comprssed_read(struct btrfs_bio *bbio) if (!status) status = errno_to_blk_status(btrfs_decompress_bio(cb)); - btrfs_free_compressed_pages(cb); + btrfs_free_compressed_folios(cb); btrfs_bio_end_io(cb->orig_bbio, status); bio_put(&bbio->bio); } @@ -323,7 +323,7 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) end_compressed_writeback(cb); /* Note, our inode could be gone now */ - btrfs_free_compressed_pages(cb); + btrfs_free_compressed_folios(cb); bio_put(&cb->bbio.bio); } @@ -342,17 +342,19 @@ static void end_bbio_comprssed_write(struct btrfs_bio *bbio) queue_work(fs_info->compressed_write_workers, &cb->write_end_work); } -static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb) +static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb) { struct bio *bio = &cb->bbio.bio; u32 offset = 0; while (offset < cb->compressed_len) { + int ret; u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE); /* Maximum compressed extent is smaller than bio size limit. */ - __bio_add_page(bio, cb->compressed_pages[offset >> PAGE_SHIFT], - len, 0); + ret = bio_add_folio(bio, cb->compressed_folios[offset >> PAGE_SHIFT], + len, 0); + ASSERT(ret); offset += len; } } @@ -367,8 +369,8 @@ static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb) * the end io hooks. */ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, - struct page **compressed_pages, - unsigned int nr_pages, + struct folio **compressed_folios, + unsigned int nr_folios, blk_opf_t write_flags, bool writeback) { @@ -384,14 +386,14 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, end_bbio_comprssed_write); cb->start = ordered->file_offset; cb->len = ordered->num_bytes; - cb->compressed_pages = compressed_pages; + cb->compressed_folios = compressed_folios; cb->compressed_len = ordered->disk_num_bytes; cb->writeback = writeback; INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); - cb->nr_pages = nr_pages; + cb->nr_folios = nr_folios; cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT; cb->bbio.ordered = ordered; - btrfs_add_compressed_bio_pages(cb); + btrfs_add_compressed_bio_folios(cb); btrfs_submit_bio(&cb->bbio, 0); } @@ -599,14 +601,14 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) free_extent_map(em); - cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); - cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS); - if (!cb->compressed_pages) { + cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE); + cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct page *), GFP_NOFS); + if (!cb->compressed_folios) { ret = BLK_STS_RESOURCE; goto out_free_bio; } - ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages, 0); + ret2 = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios, 0); if (ret2) { ret = BLK_STS_RESOURCE; goto out_free_compressed_pages; @@ -618,7 +620,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) /* include any pages we added in add_ra-bio_pages */ cb->len = bbio->bio.bi_iter.bi_size; cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector; - btrfs_add_compressed_bio_pages(cb); + btrfs_add_compressed_bio_folios(cb); if (memstall) psi_memstall_leave(&pflags); @@ -627,7 +629,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) return; out_free_compressed_pages: - kfree(cb->compressed_pages); + kfree(cb->compressed_folios); out_free_bio: bio_put(&cb->bbio.bio); out: @@ -974,6 +976,29 @@ static unsigned int btrfs_compress_set_level(int type, unsigned level) return level; } +/* Wrapper around find_get_page(), with extra error message. */ +int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, + struct folio **in_folio_ret) +{ + struct folio *in_folio; + + /* + * The compressed write path should have the folio locked already, thus + * we only need to grab one reference. + */ + in_folio = filemap_get_folio(mapping, start >> PAGE_SHIFT); + if (IS_ERR(in_folio)) { + struct btrfs_inode *inode = BTRFS_I(mapping->host); + + btrfs_crit(inode->root->fs_info, + "failed to get page cache, root %lld ino %llu file offset %llu", + btrfs_root_id(inode->root), btrfs_ino(inode), start); + return -ENOENT; + } + *in_folio_ret = in_folio; + return 0; +} + /* * Given an address space and start and length, compress the bytes into @pages * that are allocated on demand. @@ -994,11 +1019,9 @@ static unsigned int btrfs_compress_set_level(int type, unsigned level) * @total_out is an in/out parameter, must be set to the input length and will * be also used to return the total number of compressed bytes */ -int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, - u64 start, struct page **pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out) +int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, + unsigned long *total_in, unsigned long *total_out) { int type = btrfs_compress_type(type_level); int level = btrfs_compress_level(type_level); @@ -1007,8 +1030,8 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, level = btrfs_compress_set_level(type, level); workspace = get_workspace(type, level); - ret = compression_compress_pages(type, workspace, mapping, start, pages, - out_pages, total_in, total_out); + ret = compression_compress_pages(type, workspace, mapping, start, folios, + out_folios, total_in, total_out); put_workspace(type, workspace); return ret; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 4691a84ca838..c20c1a1b09d5 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -41,11 +41,11 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); #define BTRFS_ZLIB_DEFAULT_LEVEL 3 struct compressed_bio { - /* Number of compressed pages in the array */ - unsigned int nr_pages; + /* Number of compressed folios in the array. */ + unsigned int nr_folios; - /* the pages with the compressed data on them */ - struct page **compressed_pages; + /* The folios with the compressed data on them. */ + struct folio **compressed_folios; /* starting offset in the inode for our pages */ u64 start; @@ -85,27 +85,24 @@ static inline unsigned int btrfs_compress_level(unsigned int type_level) int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); -int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, - u64 start, struct page **pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out); +int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, + unsigned long *total_in, unsigned long *total_out); int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed); void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, - struct page **compressed_pages, - unsigned int nr_pages, - blk_opf_t write_flags, - bool writeback); + struct folio **compressed_folios, + unsigned int nr_folios, blk_opf_t write_flags, + bool writeback); void btrfs_submit_compressed_read(struct btrfs_bio *bbio); unsigned int btrfs_compress_str2level(unsigned int type, const char *str); -struct page *btrfs_alloc_compr_page(void); -void btrfs_free_compr_page(struct page *page); +struct folio *btrfs_alloc_compr_folio(void); +void btrfs_free_compr_folio(struct folio *folio); enum btrfs_compression_type { BTRFS_COMPRESS_NONE = 0, @@ -149,8 +146,11 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len); int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); -int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, +int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, + struct folio **in_folio_ret); + +int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zlib_decompress(struct list_head *ws, const u8 *data_in, @@ -160,8 +160,8 @@ struct list_head *zlib_alloc_workspace(unsigned int level); void zlib_free_workspace(struct list_head *ws); struct list_head *zlib_get_workspace(unsigned int level); -int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, +int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int lzo_decompress(struct list_head *ws, const u8 *data_in, @@ -170,8 +170,8 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in, struct list_head *lzo_alloc_workspace(unsigned int level); void lzo_free_workspace(struct list_head *ws); -int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, +int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); int zstd_decompress(struct list_head *ws, const u8 *data_in, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index aaf53fd84358..1a49b9232990 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -291,7 +291,7 @@ static void add_root_to_dirty_list(struct btrfs_root *root) spin_lock(&fs_info->trans_lock); if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) { /* Want the extent tree to be the last on the list */ - if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID) + if (btrfs_root_id(root) == BTRFS_EXTENT_TREE_OBJECTID) list_move_tail(&root->dirty_list, &fs_info->dirty_cowonly_roots); else @@ -454,7 +454,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, } } else { refs = 1; - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID || btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; else @@ -466,15 +466,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); if (refs > 1) { - if ((owner == root->root_key.objectid || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && + if ((owner == btrfs_root_id(root) || + btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { ret = btrfs_inc_ref(trans, root, buf, 1); if (ret) return ret; - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) { + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { ret = btrfs_dec_ref(trans, root, buf, 0); if (ret) return ret; @@ -485,8 +484,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; } else { - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) ret = btrfs_inc_ref(trans, root, cow, 1); else ret = btrfs_inc_ref(trans, root, cow, 0); @@ -500,8 +498,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, } } else { if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) ret = btrfs_inc_ref(trans, root, cow, 1); else ret = btrfs_inc_ref(trans, root, cow, 0); @@ -563,13 +560,13 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, else btrfs_node_key(buf, &disk_key, 0); - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { if (parent) parent_start = parent->start; reloc_src_root = btrfs_header_owner(buf); } cow = btrfs_alloc_tree_block(trans, root, parent_start, - root->root_key.objectid, &disk_key, level, + btrfs_root_id(root), &disk_key, level, search_start, empty_size, reloc_src_root, nest); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -582,10 +579,10 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | BTRFS_HEADER_FLAG_RELOC); - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); else - btrfs_set_header_owner(cow, root->root_key.objectid); + btrfs_set_header_owner(cow, btrfs_root_id(root)); write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); @@ -609,7 +606,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, if (buf == root->node) { WARN_ON(parent && parent != buf); - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID || btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) parent_start = buf->start; @@ -685,7 +682,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, */ if (btrfs_header_generation(buf) == trans->transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && - !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && + !(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && !test_bit(BTRFS_ROOT_FORCE_COW, &root->state)) return 0; @@ -1003,7 +1000,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto out; } - __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); + btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT); wret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left, BTRFS_NESTING_LEFT_COW); @@ -1021,7 +1018,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto out; } - __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); + btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT); wret = btrfs_cow_block(trans, root, right, parent, pslot + 1, &right, BTRFS_NESTING_RIGHT_COW); @@ -1205,7 +1202,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (IS_ERR(left)) return PTR_ERR(left); - __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); + btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT); left_nr = btrfs_header_nritems(left); if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { @@ -1265,7 +1262,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (IS_ERR(right)) return PTR_ERR(right); - __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); + btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT); right_nr = btrfs_header_nritems(right); if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { @@ -1511,7 +1508,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, check.has_first_key = true; check.level = parent_level - 1; check.transid = gen; - check.owner_root = root->root_key.objectid; + check.owner_root = btrfs_root_id(root); /* * If we need to read an extent buffer from disk and we are holding locks @@ -1556,7 +1553,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, btrfs_release_path(p); return -EIO; } - if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) { + if (btrfs_check_eb_owner(tmp, btrfs_root_id(root))) { free_extent_buffer(tmp); btrfs_release_path(p); return -EUCLEAN; @@ -2865,7 +2862,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, else btrfs_node_key(lower, &lower_key, 0); - c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + c = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), &lower_key, level, root->node->start, 0, 0, BTRFS_NESTING_NEW_ROOT); if (IS_ERR(c)) @@ -3009,7 +3006,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, mid = (c_nritems + 1) / 2; btrfs_node_key(c, &disk_key, mid); - split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + split = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), &disk_key, level, c->start, 0, 0, BTRFS_NESTING_SPLIT); if (IS_ERR(split)) @@ -3267,7 +3264,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (IS_ERR(right)) return PTR_ERR(right); - __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); + btrfs_tree_lock_nested(right, BTRFS_NESTING_RIGHT); free_space = btrfs_leaf_free_space(right); if (free_space < data_size) @@ -3483,7 +3480,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root if (IS_ERR(left)) return PTR_ERR(left); - __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); + btrfs_tree_lock_nested(left, BTRFS_NESTING_LEFT); free_space = btrfs_leaf_free_space(left); if (free_space < data_size) { @@ -3761,7 +3758,7 @@ again: * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just * use BTRFS_NESTING_NEW_ROOT. */ - right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + right = btrfs_alloc_tree_block(trans, root, 0, btrfs_root_id(root), &disk_key, 0, l->start, 0, 0, num_doubles ? BTRFS_NESTING_NEW_ROOT : BTRFS_NESTING_SPLIT); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index f015fa1b6301..407ccec3e57e 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -147,7 +147,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, defrag->ino = btrfs_ino(inode); defrag->transid = transid; - defrag->root = root->root_key.objectid; + defrag->root = btrfs_root_id(root); defrag->extent_thresh = extent_thresh; spin_lock(&fs_info->defrag_inodes_lock); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 121ab890bd05..95a0497fa866 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1651,7 +1651,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, if (unlikely(ret)) { btrfs_err(trans->fs_info, "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", - index, node->root->root_key.objectid, + index, btrfs_root_id(node->root), node->inode_id, ret); btrfs_delayed_item_release_metadata(dir->root, item); btrfs_release_delayed_item(item); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index e44e62cf76bc..6cc80fb10da2 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -16,8 +16,7 @@ #include "fs.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; -struct kmem_cache *btrfs_delayed_tree_ref_cachep; -struct kmem_cache *btrfs_delayed_data_ref_cachep; +struct kmem_cache *btrfs_delayed_ref_node_cachep; struct kmem_cache *btrfs_delayed_extent_op_cachep; /* * delayed back reference update tracking. For subvolume trees @@ -305,50 +304,19 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, } /* - * compare two delayed tree backrefs with same bytenr and type - */ -static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1, - struct btrfs_delayed_tree_ref *ref2) -{ - if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { - if (ref1->root < ref2->root) - return -1; - if (ref1->root > ref2->root) - return 1; - } else { - if (ref1->parent < ref2->parent) - return -1; - if (ref1->parent > ref2->parent) - return 1; - } - return 0; -} - -/* * compare two delayed data backrefs with same bytenr and type */ -static int comp_data_refs(struct btrfs_delayed_data_ref *ref1, - struct btrfs_delayed_data_ref *ref2) +static int comp_data_refs(struct btrfs_delayed_ref_node *ref1, + struct btrfs_delayed_ref_node *ref2) { - if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) { - if (ref1->root < ref2->root) - return -1; - if (ref1->root > ref2->root) - return 1; - if (ref1->objectid < ref2->objectid) - return -1; - if (ref1->objectid > ref2->objectid) - return 1; - if (ref1->offset < ref2->offset) - return -1; - if (ref1->offset > ref2->offset) - return 1; - } else { - if (ref1->parent < ref2->parent) - return -1; - if (ref1->parent > ref2->parent) - return 1; - } + if (ref1->data_ref.objectid < ref2->data_ref.objectid) + return -1; + if (ref1->data_ref.objectid > ref2->data_ref.objectid) + return 1; + if (ref1->data_ref.offset < ref2->data_ref.offset) + return -1; + if (ref1->data_ref.offset > ref2->data_ref.offset) + return 1; return 0; } @@ -362,13 +330,20 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1, return -1; if (ref1->type > ref2->type) return 1; - if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || - ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) - ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1), - btrfs_delayed_node_to_tree_ref(ref2)); - else - ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1), - btrfs_delayed_node_to_data_ref(ref2)); + if (ref1->type == BTRFS_SHARED_BLOCK_REF_KEY || + ref1->type == BTRFS_SHARED_DATA_REF_KEY) { + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + } else { + if (ref1->ref_root < ref2->ref_root) + return -1; + if (ref1->ref_root > ref2->ref_root) + return -1; + if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY) + ret = comp_data_refs(ref1, ref2); + } if (ret) return ret; if (check_seq) { @@ -828,18 +803,20 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, } static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, + struct btrfs_ref *generic_ref, struct btrfs_qgroup_extent_record *qrecord, - u64 bytenr, u64 num_bytes, u64 ref_root, - u64 reserved, int action, bool is_data, - bool is_system, u64 owning_root) + u64 reserved) { int count_mod = 1; bool must_insert_reserved = false; /* If reserved is provided, it must be a data extent. */ - BUG_ON(!is_data && reserved); + BUG_ON(generic_ref->type != BTRFS_REF_DATA && reserved); - switch (action) { + switch (generic_ref->action) { + case BTRFS_ADD_DELAYED_REF: + /* count_mod is already set to 1. */ + break; case BTRFS_UPDATE_DELAYED_HEAD: count_mod = 0; break; @@ -868,14 +845,14 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, } refcount_set(&head_ref->refs, 1); - head_ref->bytenr = bytenr; - head_ref->num_bytes = num_bytes; + head_ref->bytenr = generic_ref->bytenr; + head_ref->num_bytes = generic_ref->num_bytes; head_ref->ref_mod = count_mod; head_ref->reserved_bytes = reserved; head_ref->must_insert_reserved = must_insert_reserved; - head_ref->owning_root = owning_root; - head_ref->is_data = is_data; - head_ref->is_system = is_system; + head_ref->owning_root = generic_ref->owning_root; + head_ref->is_data = (generic_ref->type == BTRFS_REF_DATA); + head_ref->is_system = (generic_ref->ref_root == BTRFS_CHUNK_TREE_OBJECTID); head_ref->ref_tree = RB_ROOT_CACHED; INIT_LIST_HEAD(&head_ref->ref_add_list); RB_CLEAR_NODE(&head_ref->href_node); @@ -885,12 +862,12 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, mutex_init(&head_ref->mutex); if (qrecord) { - if (ref_root && reserved) { + if (generic_ref->ref_root && reserved) { qrecord->data_rsv = reserved; - qrecord->data_rsv_refroot = ref_root; + qrecord->data_rsv_refroot = generic_ref->ref_root; } - qrecord->bytenr = bytenr; - qrecord->num_bytes = num_bytes; + qrecord->bytenr = generic_ref->bytenr; + qrecord->num_bytes = generic_ref->num_bytes; qrecord->old_roots = NULL; } } @@ -982,49 +959,45 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, */ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, u64 ref_root, - int action, u8 ref_type) + struct btrfs_ref *generic_ref) { + int action = generic_ref->action; u64 seq = 0; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; - if (is_fstree(ref_root)) + if (is_fstree(generic_ref->ref_root)) seq = atomic64_read(&fs_info->tree_mod_seq); refcount_set(&ref->refs, 1); - ref->bytenr = bytenr; - ref->num_bytes = num_bytes; + ref->bytenr = generic_ref->bytenr; + ref->num_bytes = generic_ref->num_bytes; ref->ref_mod = 1; ref->action = action; ref->seq = seq; - ref->type = ref_type; + ref->type = btrfs_ref_type(generic_ref); + ref->ref_root = generic_ref->ref_root; + ref->parent = generic_ref->parent; RB_CLEAR_NODE(&ref->ref_node); INIT_LIST_HEAD(&ref->add_list); -} -void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, int action, u64 bytenr, - u64 len, u64 parent, u64 owning_root) -{ - generic_ref->action = action; - generic_ref->bytenr = bytenr; - generic_ref->len = len; - generic_ref->parent = parent; - generic_ref->owning_root = owning_root; + if (generic_ref->type == BTRFS_REF_DATA) + ref->data_ref = generic_ref->data_ref; + else + ref->tree_ref = generic_ref->tree_ref; } -void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 root, - u64 mod_root, bool skip_qgroup) +void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root, + bool skip_qgroup) { #ifdef CONFIG_BTRFS_FS_REF_VERIFY /* If @real_root not set, use @root as fallback */ - generic_ref->real_root = mod_root ?: root; + generic_ref->real_root = mod_root ?: generic_ref->ref_root; #endif generic_ref->tree_ref.level = level; - generic_ref->tree_ref.ref_root = root; generic_ref->type = BTRFS_REF_METADATA; - if (skip_qgroup || !(is_fstree(root) && + if (skip_qgroup || !(is_fstree(generic_ref->ref_root) && (!mod_root || is_fstree(mod_root)))) generic_ref->skip_qgroup = true; else @@ -1032,85 +1005,58 @@ void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 root, } -void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ref_root, u64 ino, - u64 offset, u64 mod_root, bool skip_qgroup) +void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset, + u64 mod_root, bool skip_qgroup) { #ifdef CONFIG_BTRFS_FS_REF_VERIFY /* If @real_root not set, use @root as fallback */ - generic_ref->real_root = mod_root ?: ref_root; + generic_ref->real_root = mod_root ?: generic_ref->ref_root; #endif - generic_ref->data_ref.ref_root = ref_root; - generic_ref->data_ref.ino = ino; + generic_ref->data_ref.objectid = ino; generic_ref->data_ref.offset = offset; generic_ref->type = BTRFS_REF_DATA; - if (skip_qgroup || !(is_fstree(ref_root) && + if (skip_qgroup || !(is_fstree(generic_ref->ref_root) && (!mod_root || is_fstree(mod_root)))) generic_ref->skip_qgroup = true; else generic_ref->skip_qgroup = false; } -/* - * add a delayed tree ref. This does all of the accounting required - * to make sure the delayed ref is eventually processed before this - * transaction commits. - */ -int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, - struct btrfs_ref *generic_ref, - struct btrfs_delayed_extent_op *extent_op) +static int add_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_ref *generic_ref, + struct btrfs_delayed_extent_op *extent_op, + u64 reserved) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_delayed_tree_ref *ref; + struct btrfs_delayed_ref_node *node; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; bool qrecord_inserted; - bool is_system; - bool merged; int action = generic_ref->action; - int level = generic_ref->tree_ref.level; - u64 bytenr = generic_ref->bytenr; - u64 num_bytes = generic_ref->len; - u64 parent = generic_ref->parent; - u8 ref_type; - - is_system = (generic_ref->tree_ref.ref_root == BTRFS_CHUNK_TREE_OBJECTID); + bool merged; - ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); - ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); - if (!ref) + node = kmem_cache_alloc(btrfs_delayed_ref_node_cachep, GFP_NOFS); + if (!node) return -ENOMEM; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) { - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + kmem_cache_free(btrfs_delayed_ref_node_cachep, node); return -ENOMEM; } if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + kmem_cache_free(btrfs_delayed_ref_node_cachep, node); kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); return -ENOMEM; } } - if (parent) - ref_type = BTRFS_SHARED_BLOCK_REF_KEY; - else - ref_type = BTRFS_TREE_BLOCK_REF_KEY; - - init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, - generic_ref->tree_ref.ref_root, action, - ref_type); - ref->root = generic_ref->tree_ref.ref_root; - ref->parent = parent; - ref->level = level; - - init_delayed_ref_head(head_ref, record, bytenr, num_bytes, - generic_ref->tree_ref.ref_root, 0, action, - false, is_system, generic_ref->owning_root); + init_delayed_ref_common(fs_info, node, generic_ref); + init_delayed_ref_head(head_ref, generic_ref, record, reserved); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; @@ -1123,7 +1069,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - merged = insert_delayed_ref(trans, head_ref, &ref->node); + merged = insert_delayed_ref(trans, head_ref, node); spin_unlock(&delayed_refs->lock); /* @@ -1132,107 +1078,39 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, */ btrfs_update_delayed_refs_rsv(trans); - trace_add_delayed_tree_ref(fs_info, &ref->node, ref, - action == BTRFS_ADD_DELAYED_EXTENT ? - BTRFS_ADD_DELAYED_REF : action); + if (generic_ref->type == BTRFS_REF_DATA) + trace_add_delayed_data_ref(trans->fs_info, node); + else + trace_add_delayed_tree_ref(trans->fs_info, node); if (merged) - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + kmem_cache_free(btrfs_delayed_ref_node_cachep, node); if (qrecord_inserted) - btrfs_qgroup_trace_extent_post(trans, record); - + return btrfs_qgroup_trace_extent_post(trans, record); return 0; } /* + * Add a delayed tree ref. This does all of the accounting required to make sure + * the delayed ref is eventually processed before this transaction commits. + */ +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_ref *generic_ref, + struct btrfs_delayed_extent_op *extent_op) +{ + ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); + return add_delayed_ref(trans, generic_ref, extent_op, 0); +} + +/* * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. */ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, u64 reserved) { - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_delayed_data_ref *ref; - struct btrfs_delayed_ref_head *head_ref; - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_qgroup_extent_record *record = NULL; - bool qrecord_inserted; - int action = generic_ref->action; - bool merged; - u64 bytenr = generic_ref->bytenr; - u64 num_bytes = generic_ref->len; - u64 parent = generic_ref->parent; - u64 ref_root = generic_ref->data_ref.ref_root; - u64 owner = generic_ref->data_ref.ino; - u64 offset = generic_ref->data_ref.offset; - u8 ref_type; - - ASSERT(generic_ref->type == BTRFS_REF_DATA && action); - ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); - if (!ref) - return -ENOMEM; - - if (parent) - ref_type = BTRFS_SHARED_DATA_REF_KEY; - else - ref_type = BTRFS_EXTENT_DATA_REF_KEY; - init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, - ref_root, action, ref_type); - ref->root = ref_root; - ref->parent = parent; - ref->objectid = owner; - ref->offset = offset; - - - head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); - if (!head_ref) { - kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); - return -ENOMEM; - } - - if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { - record = kzalloc(sizeof(*record), GFP_NOFS); - if (!record) { - kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); - kmem_cache_free(btrfs_delayed_ref_head_cachep, - head_ref); - return -ENOMEM; - } - } - - init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root, - reserved, action, true, false, generic_ref->owning_root); - head_ref->extent_op = NULL; - - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - - /* - * insert both the head node and the new ref without dropping - * the spin lock - */ - head_ref = add_delayed_ref_head(trans, head_ref, record, - action, &qrecord_inserted); - - merged = insert_delayed_ref(trans, head_ref, &ref->node); - spin_unlock(&delayed_refs->lock); - - /* - * Need to update the delayed_refs_rsv with any changes we may have - * made. - */ - btrfs_update_delayed_refs_rsv(trans); - - trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref, - action == BTRFS_ADD_DELAYED_EXTENT ? - BTRFS_ADD_DELAYED_REF : action); - if (merged) - kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); - - - if (qrecord_inserted) - return btrfs_qgroup_trace_extent_post(trans, record); - return 0; + ASSERT(generic_ref->type == BTRFS_REF_DATA && generic_ref->action); + return add_delayed_ref(trans, generic_ref, NULL, reserved); } int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, @@ -1241,13 +1119,18 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, { struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_ref generic_ref = { + .type = BTRFS_REF_METADATA, + .action = BTRFS_UPDATE_DELAYED_HEAD, + .bytenr = bytenr, + .num_bytes = num_bytes, + }; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); if (!head_ref) return -ENOMEM; - init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0, - BTRFS_UPDATE_DELAYED_HEAD, false, false, 0); + init_delayed_ref_head(head_ref, &generic_ref, NULL, 0); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; @@ -1270,18 +1153,7 @@ void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) { if (refcount_dec_and_test(&ref->refs)) { WARN_ON(!RB_EMPTY_NODE(&ref->ref_node)); - switch (ref->type) { - case BTRFS_TREE_BLOCK_REF_KEY: - case BTRFS_SHARED_BLOCK_REF_KEY: - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); - break; - case BTRFS_EXTENT_DATA_REF_KEY: - case BTRFS_SHARED_DATA_REF_KEY: - kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); - break; - default: - BUG(); - } + kmem_cache_free(btrfs_delayed_ref_node_cachep, ref); } } @@ -1300,8 +1172,7 @@ btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 byt void __cold btrfs_delayed_ref_exit(void) { kmem_cache_destroy(btrfs_delayed_ref_head_cachep); - kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); - kmem_cache_destroy(btrfs_delayed_data_ref_cachep); + kmem_cache_destroy(btrfs_delayed_ref_node_cachep); kmem_cache_destroy(btrfs_delayed_extent_op_cachep); } @@ -1311,12 +1182,8 @@ int __init btrfs_delayed_ref_init(void) if (!btrfs_delayed_ref_head_cachep) goto fail; - btrfs_delayed_tree_ref_cachep = KMEM_CACHE(btrfs_delayed_tree_ref, 0); - if (!btrfs_delayed_tree_ref_cachep) - goto fail; - - btrfs_delayed_data_ref_cachep = KMEM_CACHE(btrfs_delayed_data_ref, 0); - if (!btrfs_delayed_data_ref_cachep) + btrfs_delayed_ref_node_cachep = KMEM_CACHE(btrfs_delayed_ref_node, 0); + if (!btrfs_delayed_ref_node_cachep) goto fail; btrfs_delayed_extent_op_cachep = KMEM_CACHE(btrfs_delayed_extent_op, 0); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index b291147cb8ab..04b180ebe1fe 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -30,6 +30,32 @@ enum btrfs_delayed_ref_action { BTRFS_UPDATE_DELAYED_HEAD, } __packed; +struct btrfs_data_ref { + /* For EXTENT_DATA_REF */ + + /* Inode which refers to this data extent */ + u64 objectid; + + /* + * file_offset - extent_offset + * + * file_offset is the key.offset of the EXTENT_DATA key. + * extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data. + */ + u64 offset; +}; + +struct btrfs_tree_ref { + /* + * Level of this tree block. + * + * Shared for skinny (TREE_BLOCK_REF) and normal tree ref. + */ + int level; + + /* For non-skinny metadata, no special member needed */ +}; + struct btrfs_delayed_ref_node { struct rb_node ref_node; /* @@ -48,6 +74,15 @@ struct btrfs_delayed_ref_node { /* seq number to keep track of insertion order */ u64 seq; + /* The ref_root for this ref */ + u64 ref_root; + + /* + * The parent for this ref, if this isn't set the ref_root is the + * reference owner. + */ + u64 parent; + /* ref count on this data structure */ refcount_t refs; @@ -64,6 +99,11 @@ struct btrfs_delayed_ref_node { unsigned int action:8; unsigned int type:8; + + union { + struct btrfs_tree_ref tree_ref; + struct btrfs_data_ref data_ref; + }; }; struct btrfs_delayed_extent_op { @@ -151,21 +191,6 @@ struct btrfs_delayed_ref_head { bool processing; }; -struct btrfs_delayed_tree_ref { - struct btrfs_delayed_ref_node node; - u64 root; - u64 parent; - int level; -}; - -struct btrfs_delayed_data_ref { - struct btrfs_delayed_ref_node node; - u64 root; - u64 parent; - u64 objectid; - u64 offset; -}; - enum btrfs_delayed_ref_flags { /* Indicate that we are flushing delayed refs for the commit */ BTRFS_DELAYED_REFS_FLUSHING, @@ -214,42 +239,6 @@ enum btrfs_ref_type { BTRFS_REF_LAST, } __packed; -struct btrfs_data_ref { - /* For EXTENT_DATA_REF */ - - /* Root which owns this data reference. */ - u64 ref_root; - - /* Inode which refers to this data extent */ - u64 ino; - - /* - * file_offset - extent_offset - * - * file_offset is the key.offset of the EXTENT_DATA key. - * extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data. - */ - u64 offset; -}; - -struct btrfs_tree_ref { - /* - * Level of this tree block - * - * Shared for skinny (TREE_BLOCK_REF) and normal tree ref. - */ - int level; - - /* - * Root which owns this tree block reference. - * - * For TREE_BLOCK_REF (skinny metadata, either inline or keyed) - */ - u64 ref_root; - - /* For non-skinny metadata, no special member needed */ -}; - struct btrfs_ref { enum btrfs_ref_type type; enum btrfs_delayed_ref_action action; @@ -267,9 +256,15 @@ struct btrfs_ref { u64 real_root; #endif u64 bytenr; - u64 len; + u64 num_bytes; u64 owning_root; + /* + * The root that owns the reference for this reference, this will be set + * or ->parent will be set, depending on what type of reference this is. + */ + u64 ref_root; + /* Bytenr of the parent tree block */ u64 parent; union { @@ -279,8 +274,7 @@ struct btrfs_ref { }; extern struct kmem_cache *btrfs_delayed_ref_head_cachep; -extern struct kmem_cache *btrfs_delayed_tree_ref_cachep; -extern struct kmem_cache *btrfs_delayed_data_ref_cachep; +extern struct kmem_cache *btrfs_delayed_ref_node_cachep; extern struct kmem_cache *btrfs_delayed_extent_op_cachep; int __init btrfs_delayed_ref_init(void); @@ -318,12 +312,10 @@ static inline u64 btrfs_calc_delayed_ref_csum_bytes(const struct btrfs_fs_info * return btrfs_calc_metadata_size(fs_info, num_csum_items); } -void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, int action, u64 bytenr, - u64 len, u64 parent, u64 owning_root); -void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 root, +void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root, + bool skip_qgroup); +void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset, u64 mod_root, bool skip_qgroup); -void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ref_root, u64 ino, - u64 offset, u64 mod_root, bool skip_qgroup); static inline struct btrfs_delayed_extent_op * btrfs_alloc_delayed_extent_op(void) @@ -398,19 +390,39 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, u64 num_bytes); bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); -/* - * helper functions to cast a node into its container - */ -static inline struct btrfs_delayed_tree_ref * -btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node) +static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node) +{ + if (node->type == BTRFS_EXTENT_DATA_REF_KEY || + node->type == BTRFS_SHARED_DATA_REF_KEY) + return node->data_ref.objectid; + return node->tree_ref.level; +} + +static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node) { - return container_of(node, struct btrfs_delayed_tree_ref, node); + if (node->type == BTRFS_EXTENT_DATA_REF_KEY || + node->type == BTRFS_SHARED_DATA_REF_KEY) + return node->data_ref.offset; + return 0; } -static inline struct btrfs_delayed_data_ref * -btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) +static inline u8 btrfs_ref_type(struct btrfs_ref *ref) { - return container_of(node, struct btrfs_delayed_data_ref, node); + ASSERT(ref->type == BTRFS_REF_DATA || ref->type == BTRFS_REF_METADATA); + + if (ref->type == BTRFS_REF_DATA) { + if (ref->parent) + return BTRFS_SHARED_DATA_REF_KEY; + else + return BTRFS_EXTENT_DATA_REF_KEY; + } else { + if (ref->parent) + return BTRFS_SHARED_BLOCK_REF_KEY; + else + return BTRFS_TREE_BLOCK_REF_KEY; + } + + return 0; } #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3df5477d48a8..a91a8056758a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -646,7 +646,7 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { - bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); + bool dummy = btrfs_is_testing(fs_info); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); @@ -663,8 +663,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; root->inode_tree = RB_ROOT; - /* GFP flags are compatible with XA_FLAGS_*. */ - xa_init_flags(&root->delayed_nodes, GFP_ATOMIC); + xa_init(&root->delayed_nodes); btrfs_init_root_block_rsv(root); @@ -776,7 +775,7 @@ int btrfs_global_root_insert(struct btrfs_root *root) if (tmp) { ret = -EEXIST; btrfs_warn(fs_info, "global root %llu %llu already exists", - root->root_key.objectid, root->root_key.offset); + btrfs_root_id(root), root->root_key.offset); } return ret; } @@ -1012,7 +1011,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, } log_root->last_trans = trans->transid; - log_root->root_key.offset = root->root_key.objectid; + log_root->root_key.offset = btrfs_root_id(root); inode_item = &log_root->root_item.inode; btrfs_set_stack_inode_generation(inode_item, 1); @@ -1076,15 +1075,15 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, * For real fs, and not log/reloc trees, root owner must * match its root node owner */ - if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) && - root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && - root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - root->root_key.objectid != btrfs_header_owner(root->node)) { + if (!btrfs_is_testing(fs_info) && + btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && + btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && + btrfs_root_id(root) != btrfs_header_owner(root->node)) { btrfs_crit(fs_info, "root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu", - root->root_key.objectid, root->node->start, + btrfs_root_id(root), root->node->start, btrfs_header_owner(root->node), - root->root_key.objectid); + btrfs_root_id(root)); ret = -EUCLEAN; goto fail; } @@ -1121,9 +1120,9 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) btrfs_drew_lock_init(&root->snapshot_lock); - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && + if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID && !btrfs_is_data_reloc_root(root) && - is_fstree(root->root_key.objectid)) { + is_fstree(btrfs_root_id(root))) { set_bit(BTRFS_ROOT_SHAREABLE, &root->state); btrfs_check_and_init_root_item(&root->root_item); } @@ -1132,7 +1131,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) * Don't assign anonymous block device to roots that are not exposed to * userspace, the id pool is limited to 1M */ - if (is_fstree(root->root_key.objectid) && + if (is_fstree(btrfs_root_id(root)) && btrfs_root_refs(&root->root_item) > 0) { if (!anon_dev) { ret = get_anon_bdev(&root->anon_dev); @@ -1219,7 +1218,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->fs_roots_radix_lock); ret = radix_tree_insert(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, + (unsigned long)btrfs_root_id(root), root); if (ret == 0) { btrfs_grab_root(root); @@ -1266,9 +1265,14 @@ static void free_global_roots(struct btrfs_fs_info *fs_info) void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { + struct percpu_counter *em_counter = &fs_info->evictable_extent_maps; + percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); percpu_counter_destroy(&fs_info->ordered_bytes); + if (percpu_counter_initialized(em_counter)) + ASSERT(percpu_counter_sum_positive(em_counter) == 0); + percpu_counter_destroy(em_counter); percpu_counter_destroy(&fs_info->dev_replace.bio_counter); btrfs_free_csum_hash(fs_info); btrfs_free_stripe_hash_table(fs_info); @@ -2584,7 +2588,7 @@ static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int lev struct btrfs_tree_parent_check check = { .level = level, .transid = gen, - .owner_root = root->root_key.objectid + .owner_root = btrfs_root_id(root) }; int ret = 0; @@ -2848,6 +2852,10 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block if (ret) return ret; + ret = percpu_counter_init(&fs_info->evictable_extent_maps, 0, GFP_KERNEL); + if (ret) + return ret; + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) return ret; @@ -2930,7 +2938,7 @@ static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->fs_roots_radix_lock); break; } - root_objectid = gang[ret - 1]->root_key.objectid + 1; + root_objectid = btrfs_root_id(gang[ret - 1]) + 1; for (i = 0; i < ret; i++) { /* Avoid to grab roots in dead_roots. */ @@ -2946,7 +2954,7 @@ static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) for (i = 0; i < ret; i++) { if (!gang[i]) continue; - root_objectid = gang[i]->root_key.objectid; + root_objectid = btrfs_root_id(gang[i]); err = btrfs_orphan_cleanup(gang[i]); if (err) goto out; @@ -3618,28 +3626,25 @@ ALLOW_ERROR_INJECTION(open_ctree, ERRNO); static void btrfs_end_super_write(struct bio *bio) { struct btrfs_device *device = bio->bi_private; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - struct page *page; - - bio_for_each_segment_all(bvec, bio, iter_all) { - page = bvec->bv_page; + struct folio_iter fi; + bio_for_each_folio_all(fi, bio) { if (bio->bi_status) { btrfs_warn_rl_in_rcu(device->fs_info, - "lost page write due to IO error on %s (%d)", + "lost super block write due to IO error on %s (%d)", btrfs_dev_name(device), blk_status_to_errno(bio->bi_status)); - ClearPageUptodate(page); - SetPageError(page); btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS); - } else { - SetPageUptodate(page); + /* Ensure failure if the primary sb fails. */ + if (bio->bi_opf & REQ_FUA) + atomic_add(BTRFS_SUPER_PRIMARY_WRITE_ERROR, + &device->sb_write_errors); + else + atomic_inc(&device->sb_write_errors); } - - put_page(page); - unlock_page(page); + folio_unlock(fi.folio); + folio_put(fi.folio); } bio_put(bio); @@ -3726,13 +3731,13 @@ struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev) /* * Write superblock @sb to the @device. Do not wait for completion, all the - * pages we use for writing are locked. + * folios we use for writing are locked. * * Write @max_mirrors copies of the superblock, where 0 means default that fit * the expected device size at commit time. Note that max_mirrors must be * same for write and wait phases. * - * Return number of errors when page is not found or submission fails. + * Return number of errors when folio is not found or submission fails. */ static int write_dev_supers(struct btrfs_device *device, struct btrfs_super_block *sb, int max_mirrors) @@ -3741,19 +3746,21 @@ static int write_dev_supers(struct btrfs_device *device, struct address_space *mapping = device->bdev->bd_inode->i_mapping; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); int i; - int errors = 0; int ret; u64 bytenr, bytenr_orig; + atomic_set(&device->sb_write_errors, 0); + if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; shash->tfm = fs_info->csum_shash; for (i = 0; i < max_mirrors; i++) { - struct page *page; + struct folio *folio; struct bio *bio; struct btrfs_super_block *disk_super; + size_t offset; bytenr_orig = btrfs_sb_offset(i); ret = btrfs_sb_log_location(device, i, WRITE, &bytenr); @@ -3763,7 +3770,7 @@ static int write_dev_supers(struct btrfs_device *device, btrfs_err(device->fs_info, "couldn't get super block location for mirror %d", i); - errors++; + atomic_inc(&device->sb_write_errors); continue; } if (bytenr + BTRFS_SUPER_INFO_SIZE >= @@ -3776,20 +3783,20 @@ static int write_dev_supers(struct btrfs_device *device, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, sb->csum); - page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT, - GFP_NOFS); - if (!page) { + folio = __filemap_get_folio(mapping, bytenr >> PAGE_SHIFT, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + GFP_NOFS); + if (IS_ERR(folio)) { btrfs_err(device->fs_info, "couldn't get super block page for bytenr %llu", bytenr); - errors++; + atomic_inc(&device->sb_write_errors); continue; } + ASSERT(folio_order(folio) == 0); - /* Bump the refcount for wait_dev_supers() */ - get_page(page); - - disk_super = page_address(page); + offset = offset_in_folio(folio, bytenr); + disk_super = folio_address(folio) + offset; memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE); /* @@ -3803,8 +3810,7 @@ static int write_dev_supers(struct btrfs_device *device, bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT; bio->bi_private = device; bio->bi_end_io = btrfs_end_super_write; - __bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE, - offset_in_page(bytenr)); + bio_add_folio_nofail(bio, folio, BTRFS_SUPER_INFO_SIZE, offset); /* * We FUA only the first super block. The others we allow to @@ -3816,17 +3822,17 @@ static int write_dev_supers(struct btrfs_device *device, submit_bio(bio); if (btrfs_advance_sb_log(device, i)) - errors++; + atomic_inc(&device->sb_write_errors); } - return errors < i ? 0 : -1; + return atomic_read(&device->sb_write_errors) < i ? 0 : -1; } /* * Wait for write completion of superblocks done by write_dev_supers, * @max_mirrors same for write and wait phases. * - * Return number of errors when page is not found or not marked up to - * date. + * Return -1 if primary super block write failed or when there were no super block + * copies written. Otherwise 0. */ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) { @@ -3840,7 +3846,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) max_mirrors = BTRFS_SUPER_MIRROR_MAX; for (i = 0; i < max_mirrors; i++) { - struct page *page; + struct folio *folio; ret = btrfs_sb_log_location(device, i, READ, &bytenr); if (ret == -ENOENT) { @@ -3855,30 +3861,21 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) device->commit_total_bytes) break; - page = find_get_page(device->bdev->bd_inode->i_mapping, - bytenr >> PAGE_SHIFT); - if (!page) { - errors++; - if (i == 0) - primary_failed = true; + folio = filemap_get_folio(device->bdev->bd_inode->i_mapping, + bytenr >> PAGE_SHIFT); + /* If the folio has been removed, then we know it completed. */ + if (IS_ERR(folio)) continue; - } - /* Page is submitted locked and unlocked once the IO completes */ - wait_on_page_locked(page); - if (PageError(page)) { - errors++; - if (i == 0) - primary_failed = true; - } - - /* Drop our reference */ - put_page(page); + ASSERT(folio_order(folio) == 0); - /* Drop the reference from the writing run */ - put_page(page); + /* Folio will be unlocked once the write completes. */ + folio_wait_locked(folio); + folio_put(folio); } - /* log error, force error return */ + errors += atomic_read(&device->sb_write_errors); + if (errors >= BTRFS_SUPER_PRIMARY_WRITE_ERROR) + primary_failed = true; if (primary_failed) { btrfs_err(device->fs_info, "error writing primary super block to device %llu", device->devid); @@ -4139,7 +4136,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_delete(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid); + (unsigned long)btrfs_root_id(root)); if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) drop_ref = true; spin_unlock(&fs_info->fs_roots_radix_lock); @@ -4182,9 +4179,6 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) struct btrfs_transaction *tmp; bool found = false; - if (list_empty(&fs_info->trans_list)) - return; - /* * This function is only called at the very end of close_ctree(), * thus no other running transaction, no need to take trans_lock. @@ -4484,7 +4478,7 @@ static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info) for (i = 0; i < ret; i++) { if (!gang[i]) continue; - root_objectid = gang[i]->root_key.objectid; + root_objectid = btrfs_root_id(gang[i]); btrfs_free_log(NULL, gang[i]); btrfs_put_root(gang[i]); } @@ -4815,7 +4809,7 @@ static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info) btrfs_qgroup_free_meta_all_pertrans(root); radix_tree_tag_clear(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, + (unsigned long)btrfs_root_id(root), BTRFS_ROOT_TRANS_TAG); } } @@ -4844,14 +4838,10 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->state = TRANS_STATE_UNBLOCKED; wake_up(&fs_info->transaction_wait); - btrfs_destroy_delayed_inodes(fs_info); - btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages, EXTENT_DIRTY); btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); - btrfs_free_all_qgroup_pertrans(fs_info); - cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); } @@ -4904,6 +4894,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) btrfs_assert_delayed_root_empty(fs_info); btrfs_destroy_all_delalloc_inodes(fs_info); btrfs_drop_all_logs(fs_info); + btrfs_free_all_qgroup_pertrans(fs_info); mutex_unlock(&fs_info->transaction_kthread_mutex); return 0; @@ -4959,7 +4950,7 @@ int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) { btrfs_warn(root->fs_info, "the objectid of root %llu reaches its highest value", - root->root_key.objectid); + btrfs_root_id(root)); ret = -ENOSPC; goto out; } diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 8398d345ec5b..9e81f89e76d8 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -34,7 +34,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, type = FILEID_BTRFS_WITHOUT_PARENT; fid->objectid = btrfs_ino(BTRFS_I(inode)); - fid->root_objectid = BTRFS_I(inode)->root->root_key.objectid; + fid->root_objectid = btrfs_root_id(BTRFS_I(inode)->root); fid->gen = inode->i_generation; if (parent) { @@ -42,7 +42,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, fid->parent_objectid = BTRFS_I(parent)->location.objectid; fid->parent_gen = parent->i_generation; - parent_root_id = BTRFS_I(parent)->root->root_key.objectid; + parent_root_id = btrfs_root_id(BTRFS_I(parent)->root); if (parent_root_id != fid->root_objectid) { fid->parent_root_objectid = parent_root_id; @@ -160,7 +160,7 @@ struct dentry *btrfs_get_parent(struct dentry *child) return ERR_PTR(-ENOMEM); if (btrfs_ino(BTRFS_I(dir)) == BTRFS_FIRST_FREE_OBJECTID) { - key.objectid = root->root_key.objectid; + key.objectid = btrfs_root_id(root); key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; root = fs_info->tree_root; @@ -243,7 +243,7 @@ static int btrfs_get_name(struct dentry *parent, char *name, return -ENOMEM; if (ino == BTRFS_FIRST_FREE_OBJECTID) { - key.objectid = BTRFS_I(inode)->root->root_key.objectid; + key.objectid = btrfs_root_id(BTRFS_I(inode)->root); key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; root = fs_info->tree_root; diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index c09b428823d7..ed2cfc3d5d8a 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -1059,7 +1059,7 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *prealloc = NULL; struct rb_node **p = NULL; struct rb_node *parent = NULL; - int err = 0; + int ret = 0; u64 last_start; u64 last_end; u32 exclusive_bits = (bits & EXTENT_LOCKED); @@ -1122,7 +1122,7 @@ hit_next: if (state->state & exclusive_bits) { *failed_start = state->start; cache_state(state, failed_state); - err = -EEXIST; + ret = -EEXIST; goto out; } @@ -1158,7 +1158,7 @@ hit_next: if (state->state & exclusive_bits) { *failed_start = start; cache_state(state, failed_state); - err = -EEXIST; + ret = -EEXIST; goto out; } @@ -1175,12 +1175,12 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) goto search_again; - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, state, "split", err); + ret = split_state(tree, state, prealloc, start); + if (ret) + extent_io_tree_panic(tree, state, "split", ret); prealloc = NULL; - if (err) + if (ret) goto out; if (state->end <= end) { set_state_bits(tree, state, bits, changeset); @@ -1224,8 +1224,8 @@ hit_next: prealloc->end = this_end; inserted_state = insert_state(tree, prealloc, bits, changeset); if (IS_ERR(inserted_state)) { - err = PTR_ERR(inserted_state); - extent_io_tree_panic(tree, prealloc, "insert", err); + ret = PTR_ERR(inserted_state); + extent_io_tree_panic(tree, prealloc, "insert", ret); } cache_state(inserted_state, cached_state); @@ -1244,16 +1244,16 @@ hit_next: if (state->state & exclusive_bits) { *failed_start = start; cache_state(state, failed_state); - err = -EEXIST; + ret = -EEXIST; goto out; } prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) goto search_again; - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, state, "split", err); + ret = split_state(tree, state, prealloc, end + 1); + if (ret) + extent_io_tree_panic(tree, state, "split", ret); set_state_bits(tree, prealloc, bits, changeset); cache_state(prealloc, cached_state); @@ -1275,7 +1275,7 @@ out: if (prealloc) free_extent_state(prealloc); - return err; + return ret; } @@ -1312,7 +1312,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *prealloc = NULL; struct rb_node **p = NULL; struct rb_node *parent = NULL; - int err = 0; + int ret = 0; u64 last_start; u64 last_end; bool first_iteration = true; @@ -1351,7 +1351,7 @@ again: if (!state) { prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } prealloc->start = start; @@ -1402,14 +1402,14 @@ hit_next: if (state->start < start) { prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, state, "split", err); + ret = split_state(tree, state, prealloc, start); + if (ret) + extent_io_tree_panic(tree, state, "split", ret); prealloc = NULL; - if (err) + if (ret) goto out; if (state->end <= end) { set_state_bits(tree, state, bits, NULL); @@ -1442,7 +1442,7 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } @@ -1454,8 +1454,8 @@ hit_next: prealloc->end = this_end; inserted_state = insert_state(tree, prealloc, bits, NULL); if (IS_ERR(inserted_state)) { - err = PTR_ERR(inserted_state); - extent_io_tree_panic(tree, prealloc, "insert", err); + ret = PTR_ERR(inserted_state); + extent_io_tree_panic(tree, prealloc, "insert", ret); } cache_state(inserted_state, cached_state); if (inserted_state == prealloc) @@ -1472,13 +1472,13 @@ hit_next: if (state->start <= end && state->end > end) { prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, state, "split", err); + ret = split_state(tree, state, prealloc, end + 1); + if (ret) + extent_io_tree_panic(tree, state, "split", ret); set_state_bits(tree, prealloc, bits, NULL); cache_state(prealloc, cached_state); @@ -1500,7 +1500,7 @@ out: if (prealloc) free_extent_state(prealloc); - return err; + return ret; } /* diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 257d044bca91..47d48233b592 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -46,9 +46,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *node, u64 parent, - u64 root_objectid, u64 owner_objectid, - u64 owner_offset, + struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extra_op); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, @@ -448,9 +446,8 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_extent_data_ref *ref; struct extent_buffer *leaf; u32 nritems; - int ret; int recow; - int err = -ENOENT; + int ret; key.objectid = bytenr; if (parent) { @@ -464,26 +461,26 @@ static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, again: recow = 0; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) { - err = ret; - goto fail; - } + if (ret < 0) + return ret; if (parent) { - if (!ret) - return 0; - goto fail; + if (ret) + return -ENOENT; + return 0; } + ret = -ENOENT; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); while (1) { if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(root, path); - if (ret < 0) - err = ret; - if (ret) - goto fail; + if (ret) { + if (ret > 1) + return -ENOENT; + return ret; + } leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); @@ -504,37 +501,37 @@ again: btrfs_release_path(path); goto again; } - err = 0; + ret = 0; break; } path->slots[0]++; } fail: - return err; + return ret; } static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid, u64 owner, - u64 offset, int refs_to_add) + struct btrfs_delayed_ref_node *node, + u64 bytenr) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; struct extent_buffer *leaf; + u64 owner = btrfs_delayed_ref_owner(node); + u64 offset = btrfs_delayed_ref_offset(node); u32 size; u32 num_refs; int ret; key.objectid = bytenr; - if (parent) { + if (node->parent) { key.type = BTRFS_SHARED_DATA_REF_KEY; - key.offset = parent; + key.offset = node->parent; size = sizeof(struct btrfs_shared_data_ref); } else { key.type = BTRFS_EXTENT_DATA_REF_KEY; - key.offset = hash_extent_data_ref(root_objectid, - owner, offset); + key.offset = hash_extent_data_ref(node->ref_root, owner, offset); size = sizeof(struct btrfs_extent_data_ref); } @@ -543,15 +540,15 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, goto fail; leaf = path->nodes[0]; - if (parent) { + if (node->parent) { struct btrfs_shared_data_ref *ref; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_shared_data_ref); if (ret == 0) { - btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); + btrfs_set_shared_data_ref_count(leaf, ref, node->ref_mod); } else { num_refs = btrfs_shared_data_ref_count(leaf, ref); - num_refs += refs_to_add; + num_refs += node->ref_mod; btrfs_set_shared_data_ref_count(leaf, ref, num_refs); } } else { @@ -559,7 +556,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, while (ret == -EEXIST) { ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_data_ref); - if (match_extent_data_ref(leaf, ref, root_objectid, + if (match_extent_data_ref(leaf, ref, node->ref_root, owner, offset)) break; btrfs_release_path(path); @@ -574,14 +571,13 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_data_ref); if (ret == 0) { - btrfs_set_extent_data_ref_root(leaf, ref, - root_objectid); + btrfs_set_extent_data_ref_root(leaf, ref, node->ref_root); btrfs_set_extent_data_ref_objectid(leaf, ref, owner); btrfs_set_extent_data_ref_offset(leaf, ref, offset); - btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); + btrfs_set_extent_data_ref_count(leaf, ref, node->ref_mod); } else { num_refs = btrfs_extent_data_ref_count(leaf, ref); - num_refs += refs_to_add; + num_refs += node->ref_mod; btrfs_set_extent_data_ref_count(leaf, ref, num_refs); } } @@ -705,20 +701,20 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 root_objectid) + struct btrfs_delayed_ref_node *node, + u64 bytenr) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; int ret; key.objectid = bytenr; - if (parent) { + if (node->parent) { key.type = BTRFS_SHARED_BLOCK_REF_KEY; - key.offset = parent; + key.offset = node->parent; } else { key.type = BTRFS_TREE_BLOCK_REF_KEY; - key.offset = root_objectid; + key.offset = node->ref_root; } ret = btrfs_insert_empty_item(trans, root, path, &key, 0); @@ -1439,7 +1435,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && generic_ref->action); BUG_ON(generic_ref->type == BTRFS_REF_METADATA && - generic_ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID); + generic_ref->ref_root == BTRFS_TREE_LOG_OBJECTID); if (generic_ref->type == BTRFS_REF_METADATA) ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL); @@ -1462,34 +1458,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, * @node: The delayed ref node used to get the bytenr/length for * extent whose references are incremented. * - * @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/ - * BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical - * bytenr of the parent block. Since new extents are always - * created with indirect references, this will only be the case - * when relocating a shared extent. In that case, root_objectid - * will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must - * be 0 - * - * @root_objectid: The id of the root where this modification has originated, - * this can be either one of the well-known metadata trees or - * the subvolume id which references this extent. - * - * @owner: For data extents it is the inode number of the owning file. - * For metadata extents this parameter holds the level in the - * tree of the extent. - * - * @offset: For metadata extents the offset is ignored and is currently - * always passed as 0. For data extents it is the fileoffset - * this extent belongs to. - * * @extent_op Pointer to a structure, holding information necessary when * updating a tree block's flags * */ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, - u64 parent, u64 root_objectid, - u64 owner, u64 offset, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_path *path; @@ -1498,6 +1472,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_key key; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; + u64 owner = btrfs_delayed_ref_owner(node); + u64 offset = btrfs_delayed_ref_offset(node); u64 refs; int refs_to_add = node->ref_mod; int ret; @@ -1508,7 +1484,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, /* this will setup the path even if it fails to insert the back ref */ ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, - parent, root_objectid, owner, + node->parent, node->ref_root, owner, offset, refs_to_add, extent_op); if ((ret < 0 && ret != -EAGAIN) || !ret) goto out; @@ -1531,12 +1507,9 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, /* now insert the actual backref */ if (owner < BTRFS_FIRST_FREE_OBJECTID) - ret = insert_tree_block_ref(trans, path, bytenr, parent, - root_objectid); + ret = insert_tree_block_ref(trans, path, node, bytenr); else - ret = insert_extent_data_ref(trans, path, bytenr, parent, - root_objectid, owner, offset, - refs_to_add); + ret = insert_extent_data_ref(trans, path, node, bytenr); if (ret) btrfs_abort_transaction(trans, ret); @@ -1569,15 +1542,13 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, bool insert_reserved) { int ret = 0; - struct btrfs_delayed_data_ref *ref; u64 parent = 0; u64 flags = 0; - ref = btrfs_delayed_node_to_data_ref(node); - trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action); + trace_run_delayed_data_ref(trans->fs_info, node); if (node->type == BTRFS_SHARED_DATA_REF_KEY) - parent = ref->parent; + parent = node->parent; if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { struct btrfs_key key; @@ -1588,6 +1559,8 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, .is_inc = true, .generation = trans->transid, }; + u64 owner = btrfs_delayed_ref_owner(node); + u64 offset = btrfs_delayed_ref_offset(node); if (extent_op) flags |= extent_op->flags_to_set; @@ -1596,21 +1569,17 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = node->num_bytes; - ret = alloc_reserved_file_extent(trans, parent, ref->root, - flags, ref->objectid, - ref->offset, &key, - node->ref_mod, href->owning_root); + ret = alloc_reserved_file_extent(trans, parent, node->ref_root, + flags, owner, offset, &key, + node->ref_mod, + href->owning_root); free_head_ref_squota_rsv(trans->fs_info, href); if (!ret) ret = btrfs_record_squota_delta(trans->fs_info, &delta); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, node, parent, ref->root, - ref->objectid, ref->offset, - extent_op); + ret = __btrfs_inc_extent_ref(trans, node, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, href, node, parent, - ref->root, ref->objectid, - ref->offset, extent_op); + ret = __btrfs_free_extent(trans, href, node, extent_op); } else { BUG(); } @@ -1732,16 +1701,14 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, { int ret = 0; struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_delayed_tree_ref *ref; u64 parent = 0; u64 ref_root = 0; - ref = btrfs_delayed_node_to_tree_ref(node); - trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action); + trace_run_delayed_tree_ref(trans->fs_info, node); if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) - parent = ref->parent; - ref_root = ref->root; + parent = node->parent; + ref_root = node->ref_root; if (unlikely(node->ref_mod != 1)) { btrfs_err(trans->fs_info, @@ -1764,11 +1731,9 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, if (!ret) btrfs_record_squota_delta(fs_info, &delta); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, - ref->level, 0, extent_op); + ret = __btrfs_inc_extent_ref(trans, node, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, href, node, parent, ref_root, - ref->level, 0, extent_op); + ret = __btrfs_free_extent(trans, href, node, extent_op); } else { BUG(); } @@ -2292,7 +2257,6 @@ static noinline int check_delayed_ref(struct btrfs_root *root, { struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_data_ref *data_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_transaction *cur_trans; struct rb_node *node; @@ -2346,6 +2310,9 @@ static noinline int check_delayed_ref(struct btrfs_root *root, */ for (node = rb_first_cached(&head->ref_tree); node; node = rb_next(node)) { + u64 ref_owner; + u64 ref_offset; + ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); /* If it's a shared ref we know a cross reference exists */ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { @@ -2353,15 +2320,15 @@ static noinline int check_delayed_ref(struct btrfs_root *root, break; } - data_ref = btrfs_delayed_node_to_data_ref(ref); + ref_owner = btrfs_delayed_ref_owner(ref); + ref_offset = btrfs_delayed_ref_offset(ref); /* * If our ref doesn't match the one we're currently looking at * then we have a cross reference. */ - if (data_ref->root != root->root_key.objectid || - data_ref->objectid != objectid || - data_ref->offset != offset) { + if (ref->ref_root != btrfs_root_id(root) || + ref_owner != objectid || ref_offset != offset) { ret = 1; break; } @@ -2454,8 +2421,7 @@ static noinline int check_committed_ref(struct btrfs_root *root, ref = (struct btrfs_extent_data_ref *)(&iref->offset); if (btrfs_extent_refs(leaf, ei) != btrfs_extent_data_ref_count(leaf, ref) || - btrfs_extent_data_ref_root(leaf, ref) != - root->root_key.objectid || + btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) || btrfs_extent_data_ref_objectid(leaf, ref) != objectid || btrfs_extent_data_ref_offset(leaf, ref) != offset) goto out; @@ -2492,14 +2458,11 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int full_backref, int inc) { struct btrfs_fs_info *fs_info = root->fs_info; - u64 bytenr; - u64 num_bytes; u64 parent; u64 ref_root; u32 nritems; struct btrfs_key key; struct btrfs_file_extent_item *fi; - struct btrfs_ref generic_ref = { 0 }; bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC); int i; int action; @@ -2526,6 +2489,12 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, action = BTRFS_DROP_DELAYED_REF; for (i = 0; i < nritems; i++) { + struct btrfs_ref ref = { + .action = action, + .parent = parent, + .ref_root = ref_root, + }; + if (level == 0) { btrfs_item_key_to_cpu(buf, &key, i); if (key.type != BTRFS_EXTENT_DATA_KEY) @@ -2535,35 +2504,33 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, if (btrfs_file_extent_type(buf, fi) == BTRFS_FILE_EXTENT_INLINE) continue; - bytenr = btrfs_file_extent_disk_bytenr(buf, fi); - if (bytenr == 0) + ref.bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (ref.bytenr == 0) continue; - num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); + ref.num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); + ref.owning_root = ref_root; + key.offset -= btrfs_file_extent_offset(buf, fi); - btrfs_init_generic_ref(&generic_ref, action, bytenr, - num_bytes, parent, ref_root); - btrfs_init_data_ref(&generic_ref, ref_root, key.objectid, - key.offset, root->root_key.objectid, - for_reloc); + btrfs_init_data_ref(&ref, key.objectid, key.offset, + btrfs_root_id(root), for_reloc); if (inc) - ret = btrfs_inc_extent_ref(trans, &generic_ref); + ret = btrfs_inc_extent_ref(trans, &ref); else - ret = btrfs_free_extent(trans, &generic_ref); + ret = btrfs_free_extent(trans, &ref); if (ret) goto fail; } else { - bytenr = btrfs_node_blockptr(buf, i); - num_bytes = fs_info->nodesize; - /* We don't know the owning_root, use 0. */ - btrfs_init_generic_ref(&generic_ref, action, bytenr, - num_bytes, parent, 0); - btrfs_init_tree_ref(&generic_ref, level - 1, ref_root, - root->root_key.objectid, for_reloc); + /* We don't know the owning_root, leave as 0. */ + ref.bytenr = btrfs_node_blockptr(buf, i); + ref.num_bytes = fs_info->nodesize; + + btrfs_init_tree_ref(&ref, level - 1, + btrfs_root_id(root), for_reloc); if (inc) - ret = btrfs_inc_extent_ref(trans, &generic_ref); + ret = btrfs_inc_extent_ref(trans, &ref); else - ret = btrfs_free_extent(trans, &generic_ref); + ret = btrfs_free_extent(trans, &ref); if (ret) goto fail; } @@ -3099,9 +3066,7 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, */ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *node, u64 parent, - u64 root_objectid, u64 owner_objectid, - u64 owner_offset, + struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *info = trans->fs_info; @@ -3121,6 +3086,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, u64 refs; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; + u64 owner_objectid = btrfs_delayed_ref_owner(node); + u64 owner_offset = btrfs_delayed_ref_offset(node); bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); u64 delayed_ref_root = href->owning_root; @@ -3146,7 +3113,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, skinny_metadata = false; ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes, - parent, root_objectid, owner_objectid, + node->parent, node->ref_root, owner_objectid, owner_offset); if (ret == 0) { /* @@ -3248,7 +3215,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } else if (WARN_ON(ret == -ENOENT)) { abort_and_dump(trans, path, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d", - bytenr, parent, root_objectid, owner_objectid, + bytenr, node->parent, node->ref_root, owner_objectid, owner_offset, path->slots[0]); goto out; } else { @@ -3462,7 +3429,14 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, int ret; if (root_id != BTRFS_TREE_LOG_OBJECTID) { - struct btrfs_ref generic_ref = { 0 }; + struct btrfs_ref generic_ref = { + .action = BTRFS_DROP_DELAYED_REF, + .bytenr = buf->start, + .num_bytes = buf->len, + .parent = parent, + .owning_root = btrfs_header_owner(buf), + .ref_root = root_id, + }; /* * Assert that the extent buffer is not cleared due to @@ -3472,11 +3446,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, */ ASSERT(btrfs_header_bytenr(buf) != 0); - btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, - buf->start, buf->len, parent, - btrfs_header_owner(buf)); - btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), - root_id, 0, false); + btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false); btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); BUG_ON(ret); /* -ENOMEM */ @@ -3555,11 +3525,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) * tree log blocks never actually go into the extent allocation * tree, just update pinning info and exit early. */ - if ((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) || - (ref->type == BTRFS_REF_DATA && - ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { - btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); + if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) { + btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1); ret = 0; } else if (ref->type == BTRFS_REF_METADATA) { ret = btrfs_add_delayed_tree_ref(trans, ref, NULL); @@ -3567,10 +3534,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) ret = btrfs_add_delayed_data_ref(trans, ref, 0); } - if (!((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) || - (ref->type == BTRFS_REF_DATA && - ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID))) + if (ref->ref_root != BTRFS_TREE_LOG_OBJECTID) btrfs_ref_tree_mod(fs_info, ref); return ret; @@ -4705,7 +4669,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, bool final_tried = num_bytes == min_alloc_size; u64 flags; int ret; - bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + bool for_treelog = (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID); bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data); flags = get_alloc_profile_by_root(root, is_data); @@ -4899,16 +4863,16 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; struct extent_buffer *leaf; - struct btrfs_delayed_tree_ref *ref; u32 size = sizeof(*extent_item) + sizeof(*iref); u64 flags = extent_op->flags_to_set; + /* The owner of a tree block is the level. */ + int level = btrfs_delayed_ref_owner(node); bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); - ref = btrfs_delayed_node_to_tree_ref(node); - extent_key.objectid = node->bytenr; if (skinny_metadata) { - extent_key.offset = ref->level; + /* The owner of a tree block is the level. */ + extent_key.offset = level; extent_key.type = BTRFS_METADATA_ITEM_KEY; } else { extent_key.offset = node->num_bytes; @@ -4941,18 +4905,18 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, } else { block_info = (struct btrfs_tree_block_info *)(extent_item + 1); btrfs_set_tree_block_key(leaf, block_info, &extent_op->key); - btrfs_set_tree_block_level(leaf, block_info, ref->level); + btrfs_set_tree_block_level(leaf, block_info, level); iref = (struct btrfs_extent_inline_ref *)(block_info + 1); } if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_SHARED_BLOCK_REF_KEY); - btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent); + btrfs_set_extent_inline_ref_offset(leaf, iref, node->parent); } else { btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY); - btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root); + btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root); } btrfs_mark_buffer_dirty(trans, leaf); @@ -4966,19 +4930,20 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 offset, u64 ram_bytes, struct btrfs_key *ins) { - struct btrfs_ref generic_ref = { 0 }; - u64 root_objectid = root->root_key.objectid; - u64 owning_root = root_objectid; + struct btrfs_ref generic_ref = { + .action = BTRFS_ADD_DELAYED_EXTENT, + .bytenr = ins->objectid, + .num_bytes = ins->offset, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; - ASSERT(root_objectid != BTRFS_TREE_LOG_OBJECTID); + ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID); if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root)) - owning_root = root->relocation_src_root; + generic_ref.owning_root = root->relocation_src_root; - btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, - ins->objectid, ins->offset, 0, owning_root); - btrfs_init_data_ref(&generic_ref, root_objectid, owner, - offset, 0, false); + btrfs_init_data_ref(&generic_ref, owner, offset, 0, false); btrfs_ref_tree_mod(root->fs_info, &generic_ref); return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes); @@ -5101,7 +5066,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, */ btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); - __btrfs_tree_lock(buf, nest); + btrfs_tree_lock_nested(buf, nest); btrfs_clear_buffer_dirty(trans, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags); @@ -5116,7 +5081,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_owner(buf, owner); write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid); write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid); - if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) { buf->log_index = root->log_transid % 2; /* * we allow two log transactions at a time, use different @@ -5157,7 +5122,6 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *block_rsv; struct extent_buffer *buf; struct btrfs_delayed_extent_op *extent_op; - struct btrfs_ref generic_ref = { 0 }; u64 flags = 0; int ret; u32 blocksize = fs_info->nodesize; @@ -5200,6 +5164,14 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, BUG_ON(parent > 0); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_ref generic_ref = { + .action = BTRFS_ADD_DELAYED_EXTENT, + .bytenr = ins.objectid, + .num_bytes = ins.offset, + .parent = parent, + .owning_root = owning_root, + .ref_root = root_objectid, + }; extent_op = btrfs_alloc_delayed_extent_op(); if (!extent_op) { ret = -ENOMEM; @@ -5214,10 +5186,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->update_flags = true; extent_op->level = level; - btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, - ins.objectid, ins.offset, parent, owning_root); - btrfs_init_tree_ref(&generic_ref, level, root_objectid, - root->root_key.objectid, false); + btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false); btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op); if (ret) @@ -5355,8 +5324,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; int ret; - if (wc->stage == UPDATE_BACKREF && - btrfs_header_owner(eb) != root->root_key.objectid) + if (wc->stage == UPDATE_BACKREF && btrfs_header_owner(eb) != btrfs_root_id(root)) return 1; /* @@ -5430,7 +5398,7 @@ static int check_ref_exists(struct btrfs_trans_handle *trans, ret = lookup_extent_backref(trans, path, &iref, bytenr, root->fs_info->nodesize, parent, - root->root_key.objectid, level, 0); + btrfs_root_id(root), level, 0); btrfs_free_path(path); if (ret == -ENOENT) return 0; @@ -5460,11 +5428,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; u64 bytenr; u64 generation; - u64 parent; u64 owner_root = 0; struct btrfs_tree_parent_check check = { 0 }; struct btrfs_key key; - struct btrfs_ref ref = { 0 }; struct extent_buffer *next; int level = wc->level; int reada = 0; @@ -5488,7 +5454,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, check.level = level - 1; check.transid = generation; - check.owner_root = root->root_key.objectid; + check.owner_root = btrfs_root_id(root); check.has_first_key = true; btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]); @@ -5496,7 +5462,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, next = find_extent_buffer(fs_info, bytenr); if (!next) { next = btrfs_find_create_tree_block(fs_info, bytenr, - root->root_key.objectid, level - 1); + btrfs_root_id(root), level - 1); if (IS_ERR(next)) return PTR_ERR(next); reada = 1; @@ -5581,19 +5547,25 @@ skip: wc->refs[level - 1] = 0; wc->flags[level - 1] = 0; if (wc->stage == DROP_REFERENCE) { + struct btrfs_ref ref = { + .action = BTRFS_DROP_DELAYED_REF, + .bytenr = bytenr, + .num_bytes = fs_info->nodesize, + .owning_root = owner_root, + .ref_root = btrfs_root_id(root), + }; if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { - parent = path->nodes[level]->start; + ref.parent = path->nodes[level]->start; } else { - ASSERT(root->root_key.objectid == + ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level])); - if (root->root_key.objectid != + if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) { btrfs_err(root->fs_info, "mismatched block owner"); ret = -EIO; goto out_unlock; } - parent = 0; } /* @@ -5603,7 +5575,7 @@ skip: * ->restarted flag. */ if (wc->restarted) { - ret = check_ref_exists(trans, root, bytenr, parent, + ret = check_ref_exists(trans, root, bytenr, ref.parent, level - 1); if (ret < 0) goto out_unlock; @@ -5618,8 +5590,7 @@ skip: * already accounted them at merge time (replace_path), * thus we could skip expensive subtree trace here. */ - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - need_account) { + if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && need_account) { ret = btrfs_qgroup_trace_subtree(trans, next, generation, level - 1); if (ret) { @@ -5638,10 +5609,7 @@ skip: wc->drop_level = level; find_next_key(path, level, &wc->drop_progress); - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - fs_info->nodesize, parent, owner_root); - btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid, - 0, false); + btrfs_init_tree_ref(&ref, level - 1, 0, false); ret = btrfs_free_extent(trans, &ref); if (ret) goto out_unlock; @@ -5732,7 +5700,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, else ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ - if (is_fstree(root->root_key.objectid)) { + if (is_fstree(btrfs_root_id(root))) { ret = btrfs_qgroup_trace_leaf_items(trans, eb); if (ret) { btrfs_err_rl(fs_info, @@ -5752,12 +5720,12 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (eb == root->node) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = eb->start; - else if (root->root_key.objectid != btrfs_header_owner(eb)) + else if (btrfs_root_id(root) != btrfs_header_owner(eb)) goto owner_mismatch; } else { if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = path->nodes[level + 1]->start; - else if (root->root_key.objectid != + else if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level + 1])) goto owner_mismatch; } @@ -5771,7 +5739,7 @@ out: owner_mismatch: btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu", - btrfs_header_owner(eb), root->root_key.objectid); + btrfs_header_owner(eb), btrfs_root_id(root)); return -EUCLEAN; } @@ -5857,8 +5825,7 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, */ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) { - const bool is_reloc_root = (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID); + const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID); struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_trans_handle *trans; @@ -5872,7 +5839,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) bool root_dropped = false; bool unfinished_drop = false; - btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); + btrfs_debug(fs_info, "Drop subvolume %llu", btrfs_root_id(root)); path = btrfs_alloc_path(); if (!path) { @@ -6070,8 +6037,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) * * The most common failure here is just -ENOENT. */ - btrfs_del_orphan_item(trans, tree_root, - root->root_key.objectid); + btrfs_del_orphan_item(trans, tree_root, btrfs_root_id(root)); } } @@ -6133,9 +6099,8 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, int level; int parent_level; int ret = 0; - int wret; - BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID); path = btrfs_alloc_path(); if (!path) @@ -6169,17 +6134,16 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); while (1) { - wret = walk_down_tree(trans, root, path, wc); - if (wret < 0) { - ret = wret; + ret = walk_down_tree(trans, root, path, wc); + if (ret < 0) break; - } - wret = walk_up_tree(trans, root, path, wc, parent_level); - if (wret < 0) - ret = wret; - if (wret != 0) + ret = walk_up_tree(trans, root, path, wc, parent_level); + if (ret) { + if (ret > 0) + ret = 0; break; + } } kfree(wc); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2776112dbdf8..597387e9f040 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -396,15 +396,14 @@ again: /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, EXTENT_DELALLOC, cached_state); + + unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); if (!ret) { - unlock_extent(tree, delalloc_start, delalloc_end, - &cached_state); __unlock_for_delalloc(inode, locked_page, delalloc_start, delalloc_end); cond_resched(); goto again; } - free_extent_state(cached_state); *start = delalloc_start; *end = delalloc_end; out_failed: @@ -413,9 +412,10 @@ out_failed: void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, + struct extent_state **cached, u32 clear_bits, unsigned long page_ops) { - clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL); + clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached); __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, start, end, page_ops); @@ -667,6 +667,37 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) } /* + * Populate every free slot in a provided array with folios. + * + * @nr_folios: number of folios to allocate + * @folio_array: the array to fill with folios; any existing non-NULL entries in + * the array will be skipped + * @extra_gfp: the extra GFP flags for the allocation + * + * Return: 0 if all folios were able to be allocated; + * -ENOMEM otherwise, the partially allocated folios would be freed and + * the array slots zeroed + */ +int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array, + gfp_t extra_gfp) +{ + for (int i = 0; i < nr_folios; i++) { + if (folio_array[i]) + continue; + folio_array[i] = folio_alloc(GFP_NOFS | extra_gfp, 0); + if (!folio_array[i]) + goto error; + } + return 0; +error: + for (int i = 0; i < nr_folios; i++) { + if (folio_array[i]) + folio_put(folio_array[i]); + } + return -ENOMEM; +} + +/* * Populate every free slot in a provided array with pages. * * @nr_pages: number of pages to allocate @@ -1571,7 +1602,7 @@ static void set_btree_ioerr(struct extent_buffer *eb) * can be no longer dirty nor marked anymore for writeback (if a * subsequent modification to the extent buffer didn't happen before the * transaction commit), which makes filemap_fdata[write|wait]_range not - * able to find the pages tagged with SetPageError at transaction + * able to find the pages which contain errors at transaction * commit time. So if this happens we must abort the transaction, * otherwise we commit a super block with btree roots that point to * btree nodes/leafs whose content on disk is invalid - either garbage @@ -2246,8 +2277,7 @@ next_page: submit_write_bio(&bio_ctrl, found_error ? ret : 0); } -int extent_writepages(struct address_space *mapping, - struct writeback_control *wbc) +int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; int ret = 0; @@ -2267,7 +2297,7 @@ int extent_writepages(struct address_space *mapping, return ret; } -void extent_readahead(struct readahead_control *rac) +void btrfs_readahead(struct readahead_control *rac) { struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; struct page *pagepool[16]; @@ -2325,19 +2355,20 @@ int extent_invalidate_folio(struct extent_io_tree *tree, * are locked or under IO and drops the related state bits if it is safe * to drop the page. */ -static int try_release_extent_state(struct extent_io_tree *tree, +static bool try_release_extent_state(struct extent_io_tree *tree, struct page *page, gfp_t mask) { u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; - int ret = 1; + bool ret; if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) { - ret = 0; + ret = false; } else { u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW | EXTENT_CTLBITS | EXTENT_QGROUP_RESERVED); + int ret2; /* * At this point we can safely clear everything except the @@ -2345,15 +2376,15 @@ static int try_release_extent_state(struct extent_io_tree *tree, * The delalloc new bit will be cleared by ordered extent * completion. */ - ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL); + ret2 = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL); /* if clear_extent_bit failed for enomem reasons, * we can't allow the release to continue. */ - if (ret < 0) - ret = 0; + if (ret2 < 0) + ret = false; else - ret = 1; + ret = true; } return ret; } @@ -2363,84 +2394,80 @@ static int try_release_extent_state(struct extent_io_tree *tree, * in the range corresponding to the page, both state records and extent * map records are removed */ -int try_release_extent_mapping(struct page *page, gfp_t mask) +bool try_release_extent_mapping(struct page *page, gfp_t mask) { - struct extent_map *em; u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; - struct btrfs_inode *btrfs_inode = page_to_inode(page); - struct extent_io_tree *tree = &btrfs_inode->io_tree; - struct extent_map_tree *map = &btrfs_inode->extent_tree; - - if (gfpflags_allow_blocking(mask) && - page->mapping->host->i_size > SZ_16M) { - u64 len; - while (start <= end) { - struct btrfs_fs_info *fs_info; - u64 cur_gen; - - len = end - start + 1; - write_lock(&map->lock); - em = lookup_extent_mapping(map, start, len); - if (!em) { - write_unlock(&map->lock); - break; - } - if ((em->flags & EXTENT_FLAG_PINNED) || - em->start != start) { - write_unlock(&map->lock); - free_extent_map(em); - break; - } - if (test_range_bit_exists(tree, em->start, - extent_map_end(em) - 1, - EXTENT_LOCKED)) - goto next; - /* - * If it's not in the list of modified extents, used - * by a fast fsync, we can remove it. If it's being - * logged we can safely remove it since fsync took an - * extra reference on the em. - */ - if (list_empty(&em->list) || - (em->flags & EXTENT_FLAG_LOGGING)) - goto remove_em; - /* - * If it's in the list of modified extents, remove it - * only if its generation is older then the current one, - * in which case we don't need it for a fast fsync. - * Otherwise don't remove it, we could be racing with an - * ongoing fast fsync that could miss the new extent. - */ - fs_info = btrfs_inode->root->fs_info; - spin_lock(&fs_info->trans_lock); - cur_gen = fs_info->generation; - spin_unlock(&fs_info->trans_lock); - if (em->generation >= cur_gen) - goto next; -remove_em: - /* - * We only remove extent maps that are not in the list of - * modified extents or that are in the list but with a - * generation lower then the current generation, so there - * is no need to set the full fsync flag on the inode (it - * hurts the fsync performance for workloads with a data - * size that exceeds or is close to the system's memory). - */ - remove_extent_mapping(map, em); - /* once for the rb tree */ + struct btrfs_inode *inode = page_to_inode(page); + struct extent_io_tree *io_tree = &inode->io_tree; + + while (start <= end) { + const u64 cur_gen = btrfs_get_fs_generation(inode->root->fs_info); + const u64 len = end - start + 1; + struct extent_map_tree *extent_tree = &inode->extent_tree; + struct extent_map *em; + + write_lock(&extent_tree->lock); + em = lookup_extent_mapping(extent_tree, start, len); + if (!em) { + write_unlock(&extent_tree->lock); + break; + } + if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) { + write_unlock(&extent_tree->lock); free_extent_map(em); + break; + } + if (test_range_bit_exists(io_tree, em->start, + extent_map_end(em) - 1, EXTENT_LOCKED)) + goto next; + /* + * If it's not in the list of modified extents, used by a fast + * fsync, we can remove it. If it's being logged we can safely + * remove it since fsync took an extra reference on the em. + */ + if (list_empty(&em->list) || (em->flags & EXTENT_FLAG_LOGGING)) + goto remove_em; + /* + * If it's in the list of modified extents, remove it only if + * its generation is older then the current one, in which case + * we don't need it for a fast fsync. Otherwise don't remove it, + * we could be racing with an ongoing fast fsync that could miss + * the new extent. + */ + if (em->generation >= cur_gen) + goto next; +remove_em: + /* + * We only remove extent maps that are not in the list of + * modified extents or that are in the list but with a + * generation lower then the current generation, so there is no + * need to set the full fsync flag on the inode (it hurts the + * fsync performance for workloads with a data size that exceeds + * or is close to the system's memory). + */ + remove_extent_mapping(inode, em); + /* Once for the inode's extent map tree. */ + free_extent_map(em); next: - start = extent_map_end(em); - write_unlock(&map->lock); + start = extent_map_end(em); + write_unlock(&extent_tree->lock); - /* once for us */ - free_extent_map(em); + /* Once for us, for the lookup_extent_mapping() reference. */ + free_extent_map(em); + + if (need_resched()) { + /* + * If we need to resched but we can't block just exit + * and leave any remaining extent maps. + */ + if (!gfpflags_allow_blocking(mask)) + break; - cond_resched(); /* Allow large-extent preemption. */ + cond_resched(); } } - return try_release_extent_state(tree, page, mask); + return try_release_extent_state(io_tree, page, mask); } struct btrfs_fiemap_entry { @@ -2773,13 +2800,19 @@ static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *p goto out; } - /* See the comment at fiemap_search_slot() about why we clone. */ - copy_extent_buffer_full(clone, path->nodes[0]); /* * Important to preserve the start field, for the optimizations when * checking if extents are shared (see extent_fiemap()). + * + * We must set ->start before calling copy_extent_buffer_full(). If we + * are on sub-pagesize blocksize, we use ->start to determine the offset + * into the folio where our eb exists, and if we update ->start after + * the fact then any subsequent reads of the eb may read from a + * different offset in the folio than where we originally copied into. */ clone->start = path->nodes[0]->start; + /* See the comment at fiemap_search_slot() about why we clone. */ + copy_extent_buffer_full(clone, path->nodes[0]); slot = path->slots[0]; btrfs_release_path(path); @@ -4261,6 +4294,13 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) } } +static void clear_extent_buffer_reading(struct extent_buffer *eb) +{ + clear_bit(EXTENT_BUFFER_READING, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING); +} + static void end_bbio_meta_read(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; @@ -4269,6 +4309,13 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio) struct folio_iter fi; u32 bio_offset = 0; + /* + * If the extent buffer is marked UPTODATE before the read operation + * completes, other calls to read_extent_buffer_pages() will return + * early without waiting for the read to finish, causing data races. + */ + WARN_ON(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)); + eb->read_mirror = bbio->mirror_num; if (uptodate && @@ -4295,9 +4342,7 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio) bio_offset += len; } - clear_bit(EXTENT_BUFFER_READING, &eb->bflags); - smp_mb__after_atomic(); - wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING); + clear_extent_buffer_reading(eb); free_extent_buffer(eb); bio_put(&bbio->bio); @@ -4331,9 +4376,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, * will now be set, and we shouldn't read it in again. */ if (unlikely(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) { - clear_bit(EXTENT_BUFFER_READING, &eb->bflags); - smp_mb__after_atomic(); - wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING); + clear_extent_buffer_reading(eb); return 0; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index e3530d427e1f..dca6b12769ec 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -27,6 +27,7 @@ struct address_space; struct writeback_control; struct extent_io_tree; struct extent_map_tree; +struct extent_state; struct btrfs_block_group; struct btrfs_fs_info; struct btrfs_inode; @@ -230,18 +231,17 @@ static inline void extent_changeset_free(struct extent_changeset *changeset) kfree(changeset); } -int try_release_extent_mapping(struct page *page, gfp_t mask); +bool try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); int btrfs_read_folio(struct file *file, struct folio *folio); void extent_write_locked_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty); -int extent_writepages(struct address_space *mapping, - struct writeback_control *wbc); +int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); -void extent_readahead(struct readahead_control *rac); +void btrfs_readahead(struct readahead_control *rac); int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); int set_folio_extent_mapped(struct folio *folio); @@ -353,6 +353,7 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb); void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, + struct extent_state **cached, u32 bits_to_clear, unsigned long page_ops); int extent_invalidate_folio(struct extent_io_tree *tree, struct folio *folio, size_t offset); @@ -361,6 +362,8 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, gfp_t extra_gfp); +int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array, + gfp_t extra_gfp); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 24a048210b15..744e8952abb0 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -8,6 +8,7 @@ #include "extent_map.h" #include "compression.h" #include "btrfs_inode.h" +#include "disk-io.h" static struct kmem_cache *extent_map_cache; @@ -76,6 +77,14 @@ static u64 range_end(u64 start, u64 len) return start + len; } +static void dec_evictable_extent_maps(struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root))) + percpu_counter_dec(&fs_info->evictable_extent_maps); +} + static int tree_insert(struct rb_root_cached *root, struct extent_map *em) { struct rb_node **p = &root->rb_root.rb_node; @@ -223,8 +232,9 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma return next->block_start == prev->block_start; } -static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) +static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) { + struct extent_map_tree *tree = &inode->extent_tree; struct extent_map *merge = NULL; struct rb_node *rb; @@ -252,14 +262,13 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->len += merge->len; em->block_len += merge->block_len; em->block_start = merge->block_start; - em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; - em->mod_start = merge->mod_start; em->generation = max(em->generation, merge->generation); em->flags |= EXTENT_FLAG_MERGED; rb_erase_cached(&merge->rb_node, &tree->map); RB_CLEAR_NODE(&merge->rb_node); free_extent_map(merge); + dec_evictable_extent_maps(inode); } } @@ -271,10 +280,10 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->block_len += merge->block_len; rb_erase_cached(&merge->rb_node, &tree->map); RB_CLEAR_NODE(&merge->rb_node); - em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; em->generation = max(em->generation, merge->generation); em->flags |= EXTENT_FLAG_MERGED; free_extent_map(merge); + dec_evictable_extent_maps(inode); } } @@ -300,7 +309,6 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) struct extent_map_tree *tree = &inode->extent_tree; int ret = 0; struct extent_map *em; - bool prealloc = false; write_lock(&tree->lock); em = lookup_extent_mapping(tree, start, len); @@ -325,20 +333,8 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) em->generation = gen; em->flags &= ~EXTENT_FLAG_PINNED; - em->mod_start = em->start; - em->mod_len = em->len; - if (em->flags & EXTENT_FLAG_FILLING) { - prealloc = true; - em->flags &= ~EXTENT_FLAG_FILLING; - } - - try_merge_map(tree, em); - - if (prealloc) { - em->mod_start = em->start; - em->mod_len = em->len; - } + try_merge_map(inode, em); out: write_unlock(&tree->lock); @@ -347,58 +343,62 @@ out: } -void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) +void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em) { - lockdep_assert_held_write(&tree->lock); + lockdep_assert_held_write(&inode->extent_tree.lock); em->flags &= ~EXTENT_FLAG_LOGGING; if (extent_map_in_tree(em)) - try_merge_map(tree, em); + try_merge_map(inode, em); } -static inline void setup_extent_mapping(struct extent_map_tree *tree, +static inline void setup_extent_mapping(struct btrfs_inode *inode, struct extent_map *em, int modified) { refcount_inc(&em->refs); - em->mod_start = em->start; - em->mod_len = em->len; ASSERT(list_empty(&em->list)); if (modified) - list_add(&em->list, &tree->modified_extents); + list_add(&em->list, &inode->extent_tree.modified_extents); else - try_merge_map(tree, em); + try_merge_map(inode, em); } /* - * Add new extent map to the extent tree + * Add a new extent map to an inode's extent map tree. * - * @tree: tree to insert new map in + * @inode: the target inode * @em: map to insert * @modified: indicate whether the given @em should be added to the * modified list, which indicates the extent needs to be logged * - * Insert @em into @tree or perform a simple forward/backward merge with - * existing mappings. The extent_map struct passed in will be inserted - * into the tree directly, with an additional reference taken, or a - * reference dropped if the merge attempt was successful. + * Insert @em into the @inode's extent map tree or perform a simple + * forward/backward merge with existing mappings. The extent_map struct passed + * in will be inserted into the tree directly, with an additional reference + * taken, or a reference dropped if the merge attempt was successful. */ -static int add_extent_mapping(struct extent_map_tree *tree, +static int add_extent_mapping(struct btrfs_inode *inode, struct extent_map *em, int modified) { - int ret = 0; + struct extent_map_tree *tree = &inode->extent_tree; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; lockdep_assert_held_write(&tree->lock); ret = tree_insert(&tree->map, em); if (ret) - goto out; + return ret; - setup_extent_mapping(tree, em, modified); -out: - return ret; + setup_extent_mapping(inode, em, modified); + + if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(root))) + percpu_counter_inc(&fs_info->evictable_extent_maps); + + return 0; } static struct extent_map * @@ -464,16 +464,18 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, } /* - * Remove an extent_map from the extent tree. + * Remove an extent_map from its inode's extent tree. * - * @tree: extent tree to remove from + * @inode: the inode the extent map belongs to * @em: extent map being removed * - * Remove @em from @tree. No reference counts are dropped, and no checks - * are done to see if the range is in use. + * Remove @em from the extent tree of @inode. No reference counts are dropped, + * and no checks are done to see if the range is in use. */ -void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) +void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em) { + struct extent_map_tree *tree = &inode->extent_tree; + lockdep_assert_held_write(&tree->lock); WARN_ON(em->flags & EXTENT_FLAG_PINNED); @@ -481,13 +483,17 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) if (!(em->flags & EXTENT_FLAG_LOGGING)) list_del_init(&em->list); RB_CLEAR_NODE(&em->rb_node); + + dec_evictable_extent_maps(inode); } -static void replace_extent_mapping(struct extent_map_tree *tree, +static void replace_extent_mapping(struct btrfs_inode *inode, struct extent_map *cur, struct extent_map *new, int modified) { + struct extent_map_tree *tree = &inode->extent_tree; + lockdep_assert_held_write(&tree->lock); WARN_ON(cur->flags & EXTENT_FLAG_PINNED); @@ -497,7 +503,7 @@ static void replace_extent_mapping(struct extent_map_tree *tree, rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map); RB_CLEAR_NODE(&cur->rb_node); - setup_extent_mapping(tree, new, modified); + setup_extent_mapping(inode, new, modified); } static struct extent_map *next_extent_map(const struct extent_map *em) @@ -526,7 +532,7 @@ static struct extent_map *prev_extent_map(struct extent_map *em) * and an extent that you want to insert, deal with overlap and insert * the best fitted new extent into the tree. */ -static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, +static noinline int merge_extent_mapping(struct btrfs_inode *inode, struct extent_map *existing, struct extent_map *em, u64 map_start) @@ -560,14 +566,13 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, em->block_start += start_diff; em->block_len = em->len; } - return add_extent_mapping(em_tree, em, 0); + return add_extent_mapping(inode, em, 0); } /* - * Add extent mapping into em_tree. + * Add extent mapping into an inode's extent map tree. * - * @fs_info: the filesystem - * @em_tree: extent tree into which we want to insert the extent mapping + * @inode: target inode * @em_in: extent we are inserting * @start: start of the logical range btrfs_get_extent() is requesting * @len: length of the logical range btrfs_get_extent() is requesting @@ -575,8 +580,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, * Note that @em_in's range may be different from [start, start+len), * but they must be overlapped. * - * Insert @em_in into @em_tree. In case there is an overlapping range, handle - * the -EEXIST by either: + * Insert @em_in into the inode's extent map tree. In case there is an + * overlapping range, handle the -EEXIST by either: * a) Returning the existing extent in @em_in if @start is within the * existing em. * b) Merge the existing extent with @em_in passed in. @@ -584,12 +589,12 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, * Return 0 on success, otherwise -EEXIST. * */ -int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree, +int btrfs_add_extent_mapping(struct btrfs_inode *inode, struct extent_map **em_in, u64 start, u64 len) { int ret; struct extent_map *em = *em_in; + struct btrfs_fs_info *fs_info = inode->root->fs_info; /* * Tree-checker should have rejected any inline extent with non-zero @@ -598,7 +603,7 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, if (em->block_start == EXTENT_MAP_INLINE) ASSERT(em->start == 0); - ret = add_extent_mapping(em_tree, em, 0); + ret = add_extent_mapping(inode, em, 0); /* it is possible that someone inserted the extent into the tree * while we had the lock dropped. It is also possible that * an overlapping map exists in the tree @@ -606,7 +611,7 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, if (ret == -EEXIST) { struct extent_map *existing; - existing = search_extent_mapping(em_tree, start, len); + existing = search_extent_mapping(&inode->extent_tree, start, len); trace_btrfs_handle_em_exist(fs_info, existing, em, start, len); @@ -627,8 +632,7 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, * The existing extent map is the one nearest to * the [start, start + len) range which overlaps */ - ret = merge_extent_mapping(em_tree, existing, - em, start); + ret = merge_extent_mapping(inode, existing, em, start); if (WARN_ON(ret)) { free_extent_map(em); *em_in = NULL; @@ -650,8 +654,10 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, * if needed. This avoids searching the tree, from the root down to the first * extent map, before each deletion. */ -static void drop_all_extent_maps_fast(struct extent_map_tree *tree) +static void drop_all_extent_maps_fast(struct btrfs_inode *inode) { + struct extent_map_tree *tree = &inode->extent_tree; + write_lock(&tree->lock); while (!RB_EMPTY_ROOT(&tree->map.rb_root)) { struct extent_map *em; @@ -660,7 +666,7 @@ static void drop_all_extent_maps_fast(struct extent_map_tree *tree) node = rb_first_cached(&tree->map); em = rb_entry(node, struct extent_map, rb_node); em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING); - remove_extent_mapping(tree, em); + remove_extent_mapping(inode, em); free_extent_map(em); cond_resched_rwlock_write(&tree->lock); } @@ -693,7 +699,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, WARN_ON(end < start); if (end == (u64)-1) { if (start == 0 && !skip_pinned) { - drop_all_extent_maps_fast(em_tree); + drop_all_extent_maps_fast(inode); return; } len = (u64)-1; @@ -790,7 +796,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, split->generation = gen; split->flags = flags; - replace_extent_mapping(em_tree, em, split, modified); + replace_extent_mapping(inode, em, split, modified); free_extent_map(split); split = split2; split2 = NULL; @@ -831,13 +837,11 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, } if (extent_map_in_tree(em)) { - replace_extent_mapping(em_tree, em, split, - modified); + replace_extent_mapping(inode, em, split, modified); } else { int ret; - ret = add_extent_mapping(em_tree, split, - modified); + ret = add_extent_mapping(inode, split, modified); /* Logic error, shouldn't happen. */ ASSERT(ret == 0); if (WARN_ON(ret != 0) && modified) @@ -872,7 +876,7 @@ remove_em: ASSERT(!split); btrfs_set_inode_full_sync(inode); } - remove_extent_mapping(em_tree, em); + remove_extent_mapping(inode, em); } /* @@ -927,7 +931,7 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode, do { btrfs_drop_extent_map_range(inode, new_em->start, end, false); write_lock(&tree->lock); - ret = add_extent_mapping(tree, new_em, modified); + ret = add_extent_mapping(inode, new_em, modified); write_unlock(&tree->lock); } while (ret == -EEXIST); @@ -991,7 +995,7 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, split_pre->flags = flags; split_pre->generation = em->generation; - replace_extent_mapping(em_tree, em, split_pre, 1); + replace_extent_mapping(inode, em, split_pre, 1); /* * Now we only have an extent_map at: @@ -1008,7 +1012,7 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, split_mid->ram_bytes = split_mid->len; split_mid->flags = flags; split_mid->generation = em->generation; - add_extent_mapping(em_tree, split_mid, 1); + add_extent_mapping(inode, split_mid, 1); /* Once for us */ free_extent_map(em); @@ -1023,3 +1027,175 @@ out_free_pre: free_extent_map(split_pre); return ret; } + +static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan) +{ + const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info); + struct extent_map_tree *tree = &inode->extent_tree; + long nr_dropped = 0; + struct rb_node *node; + + /* + * Take the mmap lock so that we serialize with the inode logging phase + * of fsync because we may need to set the full sync flag on the inode, + * in case we have to remove extent maps in the tree's list of modified + * extents. If we set the full sync flag in the inode while an fsync is + * in progress, we may risk missing new extents because before the flag + * is set, fsync decides to only wait for writeback to complete and then + * during inode logging it sees the flag set and uses the subvolume tree + * to find new extents, which may not be there yet because ordered + * extents haven't completed yet. + * + * We also do a try lock because otherwise we could deadlock. This is + * because the shrinker for this filesystem may be invoked while we are + * in a path that is holding the mmap lock in write mode. For example in + * a reflink operation while COWing an extent buffer, when allocating + * pages for a new extent buffer and under memory pressure, the shrinker + * may be invoked, and therefore we would deadlock by attempting to read + * lock the mmap lock while we are holding already a write lock on it. + */ + if (!down_read_trylock(&inode->i_mmap_lock)) + return 0; + + write_lock(&tree->lock); + node = rb_first_cached(&tree->map); + while (node) { + struct extent_map *em; + + em = rb_entry(node, struct extent_map, rb_node); + node = rb_next(node); + (*scanned)++; + + if (em->flags & EXTENT_FLAG_PINNED) + goto next; + + /* + * If the inode is in the list of modified extents (new) and its + * generation is the same (or is greater than) the current fs + * generation, it means it was not yet persisted so we have to + * set the full sync flag so that the next fsync will not miss + * it. + */ + if (!list_empty(&em->list) && em->generation >= cur_fs_gen) + btrfs_set_inode_full_sync(inode); + + remove_extent_mapping(inode, em); + trace_btrfs_extent_map_shrinker_remove_em(inode, em); + /* Drop the reference for the tree. */ + free_extent_map(em); + nr_dropped++; +next: + if (*scanned >= nr_to_scan) + break; + + /* + * Restart if we had to reschedule, and any extent maps that were + * pinned before may have become unpinned after we released the + * lock and took it again. + */ + if (cond_resched_rwlock_write(&tree->lock)) + node = rb_first_cached(&tree->map); + } + write_unlock(&tree->lock); + up_read(&inode->i_mmap_lock); + + return nr_dropped; +} + +static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_inode *inode; + long nr_dropped = 0; + u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1; + + inode = btrfs_find_first_inode(root, min_ino); + while (inode) { + nr_dropped += btrfs_scan_inode(inode, scanned, nr_to_scan); + + min_ino = btrfs_ino(inode) + 1; + fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode); + iput(&inode->vfs_inode); + + if (*scanned >= nr_to_scan) + break; + + cond_resched(); + inode = btrfs_find_first_inode(root, min_ino); + } + + if (inode) { + /* + * There are still inodes in this root or we happened to process + * the last one and reached the scan limit. In either case set + * the current root to this one, so we'll resume from the next + * inode if there is one or we will find out this was the last + * one and move to the next root. + */ + fs_info->extent_map_shrinker_last_root = btrfs_root_id(root); + } else { + /* + * No more inodes in this root, set extent_map_shrinker_last_ino to 0 so + * that when processing the next root we start from its first inode. + */ + fs_info->extent_map_shrinker_last_ino = 0; + fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1; + } + + return nr_dropped; +} + +long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) +{ + const u64 start_root_id = fs_info->extent_map_shrinker_last_root; + u64 next_root_id = start_root_id; + bool cycled = false; + long nr_dropped = 0; + long scanned = 0; + + if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) { + s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); + + trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, nr); + } + + while (scanned < nr_to_scan) { + struct btrfs_root *root; + unsigned long count; + + spin_lock(&fs_info->fs_roots_radix_lock); + count = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)&root, + (unsigned long)next_root_id, 1); + if (count == 0) { + spin_unlock(&fs_info->fs_roots_radix_lock); + if (start_root_id > 0 && !cycled) { + next_root_id = 0; + fs_info->extent_map_shrinker_last_root = 0; + fs_info->extent_map_shrinker_last_ino = 0; + cycled = true; + continue; + } + break; + } + next_root_id = btrfs_root_id(root) + 1; + root = btrfs_grab_root(root); + spin_unlock(&fs_info->fs_roots_radix_lock); + + if (!root) + continue; + + if (is_fstree(btrfs_root_id(root))) + nr_dropped += btrfs_scan_root(root, &scanned, nr_to_scan); + + btrfs_put_root(root); + } + + if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) { + s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); + + trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr); + } + + return nr_dropped; +} diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index c5a098c99cc6..6d587111f73a 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -30,28 +30,77 @@ enum { ENUM_BIT(EXTENT_FLAG_PREALLOC), /* Logging this extent */ ENUM_BIT(EXTENT_FLAG_LOGGING), - /* Filling in a preallocated extent */ - ENUM_BIT(EXTENT_FLAG_FILLING), /* This em is merged from two or more physically adjacent ems */ ENUM_BIT(EXTENT_FLAG_MERGED), }; /* + * This structure represents file extents and holes. + * + * Unlike on-disk file extent items, extent maps can be merged to save memory. + * This means members only match file extent items before any merging. + * * Keep this structure as compact as possible, as we can have really large * amounts of allocated extent maps at any time. */ struct extent_map { struct rb_node rb_node; - /* all of these are in bytes */ + /* All of these are in bytes. */ + + /* File offset matching the offset of a BTRFS_EXTENT_ITEM_KEY key. */ u64 start; + + /* + * Length of the file extent. + * + * For non-inlined file extents it's btrfs_file_extent_item::num_bytes. + * For inline extents it's sectorsize, since inline data starts at + * offsetof(struct btrfs_file_extent_item, disk_bytenr) thus + * btrfs_file_extent_item::num_bytes is not valid. + */ u64 len; - u64 mod_start; - u64 mod_len; + + /* + * The file offset of the original file extent before splitting. + * + * This is an in-memory only member, matching + * extent_map::start - btrfs_file_extent_item::offset for + * regular/preallocated extents. EXTENT_MAP_HOLE otherwise. + */ u64 orig_start; + + /* + * The full on-disk extent length, matching + * btrfs_file_extent_item::disk_num_bytes. + */ u64 orig_block_len; + + /* + * The decompressed size of the whole on-disk extent, matching + * btrfs_file_extent_item::ram_bytes. + */ u64 ram_bytes; + + /* + * The on-disk logical bytenr for the file extent. + * + * For compressed extents it matches btrfs_file_extent_item::disk_bytenr. + * For uncompressed extents it matches + * btrfs_file_extent_item::disk_bytenr + btrfs_file_extent_item::offset + * + * For holes it is EXTENT_MAP_HOLE and for inline extents it is + * EXTENT_MAP_INLINE. + */ u64 block_start; + + /* + * The on-disk length for the file extent. + * + * For compressed extents it matches btrfs_file_extent_item::disk_num_bytes. + * For uncompressed extents it matches extent_map::len. + * For holes and inline extents it's -1 and shouldn't be used. + */ u64 block_len; /* @@ -124,7 +173,7 @@ static inline u64 extent_map_end(const struct extent_map *em) void extent_map_tree_init(struct extent_map_tree *tree); struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); -void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); +void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em); int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, u64 new_logical); @@ -133,11 +182,10 @@ void free_extent_map(struct extent_map *em); int __init extent_map_init(void); void __cold extent_map_exit(void); int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen); -void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); +void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); -int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree, +int btrfs_add_extent_mapping(struct btrfs_inode *inode, struct extent_map **em_in, u64 start, u64 len); void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, @@ -145,5 +193,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, int btrfs_replace_extent_map_range(struct btrfs_inode *inode, struct extent_map *new_em, bool modified); +long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan); #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index e58fb5347e65..bce95f871750 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -430,8 +430,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) memset(csum_dst, 0, csum_size); count = 1; - if (inode->root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) { + if (btrfs_root_id(inode->root) == BTRFS_DATA_RELOC_TREE_OBJECTID) { u64 file_offset = bbio->file_offset + bio_offset; set_extent_bit(&inode->io_tree, file_offset, @@ -450,9 +449,22 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) return ret; } +/* + * Search for checksums for a given logical range. + * + * @root: The root where to look for checksums. + * @start: Logical address of target checksum range. + * @end: End offset (inclusive) of the target checksum range. + * @list: List for adding each checksum that was found. + * Can be NULL in case the caller only wants to check if + * there any checksums for the range. + * @nowait: Indicate if the search must be non-blocking or not. + * + * Return < 0 on error, 0 if no checksums were found, or 1 if checksums were + * found. + */ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit, - bool nowait) + struct list_head *list, bool nowait) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; @@ -460,8 +472,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, struct extent_buffer *leaf; struct btrfs_ordered_sum *sums; struct btrfs_csum_item *item; - LIST_HEAD(tmplist); int ret; + bool found_csums = false; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(end + 1, fs_info->sectorsize)); @@ -471,11 +483,6 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, return -ENOMEM; path->nowait = nowait; - if (search_commit) { - path->skip_locking = 1; - path->reada = READA_FORWARD; - path->search_commit_root = 1; - } key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; key.offset = start; @@ -483,7 +490,7 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto fail; + goto out; if (ret > 0 && path->slots[0] > 0) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); @@ -518,7 +525,7 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto fail; + goto out; if (ret > 0) break; leaf = path->nodes[0]; @@ -540,6 +547,10 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, continue; } + found_csums = true; + if (!list) + goto out; + csum_end = min(csum_end, end + 1); item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_csum_item); @@ -553,7 +564,7 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, GFP_NOFS); if (!sums) { ret = -ENOMEM; - goto fail; + goto out; } sums->logical = start; @@ -567,21 +578,24 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, bytes_to_csum_size(fs_info, size)); start += size; - list_add_tail(&sums->list, &tmplist); + list_add_tail(&sums->list, list); } path->slots[0]++; } - ret = 0; -fail: - while (ret < 0 && !list_empty(&tmplist)) { - sums = list_entry(tmplist.next, struct btrfs_ordered_sum, list); - list_del(&sums->list); - kfree(sums); +out: + btrfs_free_path(path); + if (ret < 0) { + if (list) { + struct btrfs_ordered_sum *tmp_sums; + + list_for_each_entry_safe(sums, tmp_sums, list, list) + kfree(sums); + } + + return ret; } - list_splice_tail(&tmplist, list); - btrfs_free_path(path); - return ret; + return found_csums ? 1 : 0; } /* @@ -870,8 +884,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, const u32 csum_size = fs_info->csum_size; u32 blocksize_bits = fs_info->sectorsize_bits; - ASSERT(root->root_key.objectid == BTRFS_CSUM_TREE_OBJECTID || - root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + ASSERT(btrfs_root_id(root) == BTRFS_CSUM_TREE_OBJECTID || + btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID); path = btrfs_alloc_path(); if (!path) @@ -1171,7 +1185,7 @@ extend_csum: * search, etc, because log trees are temporary anyway and it * would only save a few bytes of leaf space. */ - if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) { if (path->slots[0] + 1 >= btrfs_header_nritems(path->nodes[0])) { ret = find_next_csum_offset(root, path, &next_offset); @@ -1265,20 +1279,19 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, struct extent_buffer *leaf = path->nodes[0]; const int slot = path->slots[0]; struct btrfs_key key; - u64 extent_start, extent_end; + u64 extent_start; u64 bytenr; u8 type = btrfs_file_extent_type(leaf, fi); int compress_type = btrfs_file_extent_compression(leaf, fi); btrfs_item_key_to_cpu(leaf, &key, slot); extent_start = key.offset; - extent_end = btrfs_file_extent_end(path); em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); em->generation = btrfs_file_extent_generation(leaf, fi); if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { em->start = extent_start; - em->len = extent_end - extent_start; + em->len = btrfs_file_extent_end(path) - extent_start; em->orig_start = extent_start - btrfs_file_extent_offset(leaf, fi); em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); @@ -1299,9 +1312,12 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, em->flags |= EXTENT_FLAG_PREALLOC; } } else if (type == BTRFS_FILE_EXTENT_INLINE) { + /* Tree-checker has ensured this. */ + ASSERT(extent_start == 0); + em->block_start = EXTENT_MAP_INLINE; - em->start = extent_start; - em->len = extent_end - extent_start; + em->start = 0; + em->len = fs_info->sectorsize; /* * Initialize orig_start and block_len with the same values * as in inode.c:btrfs_get_extent(). @@ -1313,7 +1329,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, btrfs_err(fs_info, "unknown file extent item type %d, inode %llu, offset %llu, " "root %llu", type, btrfs_ino(inode), extent_start, - root->root_key.objectid); + btrfs_root_id(root)); } } @@ -1334,12 +1350,10 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path) ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { - end = btrfs_file_extent_ram_bytes(leaf, fi); - end = ALIGN(key.offset + end, leaf->fs_info->sectorsize); - } else { + if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) + end = leaf->fs_info->sectorsize; + else end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - } return end; } diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 15c05cc0fce6..557dc43d7142 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -68,8 +68,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, bool nowait); int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit, - bool nowait); + struct list_head *list, bool nowait); int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path, u64 start, u64 end, u8 *csum_buf, unsigned long *csum_bitmap); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1640c46f2153..e764ac3f22e2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -128,7 +128,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, struct extent_state **cached, bool noreserve) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - int err = 0; + int ret = 0; int i; u64 num_bytes; u64 start_pos; @@ -158,10 +158,10 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, cached); - err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, + ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, extra_bits, cached); - if (err) - return err; + if (ret) + return ret; for (i = 0; i < num_pages; i++) { struct page *p = pages[i]; @@ -206,7 +206,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; - struct btrfs_ref ref = { 0 }; struct btrfs_key key; struct btrfs_key new_key; u64 ino = btrfs_ino(inode); @@ -246,7 +245,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (args->start >= inode->disk_i_size && !args->replace_extent) modify_tree = 0; - update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); + update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID); while (1) { recow = 0; ret = btrfs_lookup_file_extent(trans, root, path, ino, @@ -373,15 +372,17 @@ next_slot: btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) { - btrfs_init_generic_ref(&ref, - BTRFS_ADD_DELAYED_REF, - disk_bytenr, num_bytes, 0, - root->root_key.objectid); - btrfs_init_data_ref(&ref, - root->root_key.objectid, - new_key.objectid, - args->start - extent_offset, - 0, false); + struct btrfs_ref ref = { + .action = BTRFS_ADD_DELAYED_REF, + .bytenr = disk_bytenr, + .num_bytes = num_bytes, + .parent = 0, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; + btrfs_init_data_ref(&ref, new_key.objectid, + args->start - extent_offset, + 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -464,15 +465,17 @@ delete_extent_item: extent_end = ALIGN(extent_end, fs_info->sectorsize); } else if (update_refs && disk_bytenr > 0) { - btrfs_init_generic_ref(&ref, - BTRFS_DROP_DELAYED_REF, - disk_bytenr, num_bytes, 0, - root->root_key.objectid); - btrfs_init_data_ref(&ref, - root->root_key.objectid, - key.objectid, - key.offset - extent_offset, 0, - false); + struct btrfs_ref ref = { + .action = BTRFS_DROP_DELAYED_REF, + .bytenr = disk_bytenr, + .num_bytes = num_bytes, + .parent = 0, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; + btrfs_init_data_ref(&ref, key.objectid, + key.offset - extent_offset, + 0, false); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -748,10 +751,13 @@ again: extent_end - split); btrfs_mark_buffer_dirty(trans, leaf); - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr, - num_bytes, 0, root->root_key.objectid); - btrfs_init_data_ref(&ref, root->root_key.objectid, ino, - orig_offset, 0, false); + ref.action = BTRFS_ADD_DELAYED_REF; + ref.bytenr = bytenr; + ref.num_bytes = num_bytes; + ref.parent = 0; + ref.owning_root = btrfs_root_id(root); + ref.ref_root = btrfs_root_id(root); + btrfs_init_data_ref(&ref, ino, orig_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -774,10 +780,14 @@ again: other_start = end; other_end = 0; - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - num_bytes, 0, root->root_key.objectid); - btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset, - 0, false); + + ref.action = BTRFS_DROP_DELAYED_REF; + ref.bytenr = bytenr; + ref.num_bytes = num_bytes; + ref.parent = 0; + ref.owning_root = btrfs_root_id(root); + ref.ref_root = btrfs_root_id(root); + btrfs_init_data_ref(&ref, ino, orig_offset, 0, false); if (extent_mergeable(leaf, path->slots[0] + 1, ino, bytenr, orig_offset, &other_start, &other_end)) { @@ -915,7 +925,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages, unsigned long index = pos >> PAGE_SHIFT; gfp_t mask = get_prepare_gfp_flags(inode, nowait); fgf_t fgp_flags = get_prepare_fgp_flags(nowait); - int err = 0; + int ret = 0; int faili; for (i = 0; i < num_pages; i++) { @@ -925,28 +935,28 @@ again: if (!pages[i]) { faili = i - 1; if (nowait) - err = -EAGAIN; + ret = -EAGAIN; else - err = -ENOMEM; + ret = -ENOMEM; goto fail; } - err = set_page_extent_mapped(pages[i]); - if (err < 0) { + ret = set_page_extent_mapped(pages[i]); + if (ret < 0) { faili = i; goto fail; } if (i == 0) - err = prepare_uptodate_page(inode, pages[i], pos, + ret = prepare_uptodate_page(inode, pages[i], pos, force_uptodate); - if (!err && i == num_pages - 1) - err = prepare_uptodate_page(inode, pages[i], + if (!ret && i == num_pages - 1) + ret = prepare_uptodate_page(inode, pages[i], pos + write_bytes, false); - if (err) { + if (ret) { put_page(pages[i]); - if (!nowait && err == -EAGAIN) { - err = 0; + if (!nowait && ret == -EAGAIN) { + ret = 0; goto again; } faili = i - 1; @@ -962,7 +972,7 @@ fail: put_page(pages[faili]); faili--; } - return err; + return ret; } @@ -1465,7 +1475,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) ssize_t written_buffered; size_t prev_left = 0; loff_t endbyte; - ssize_t err; + ssize_t ret; unsigned int ilock_flags = 0; struct iomap_dio *dio; @@ -1482,9 +1492,9 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) ilock_flags |= BTRFS_ILOCK_SHARED; relock: - err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); - if (err < 0) - return err; + ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); + if (ret < 0) + return ret; /* Shared lock cannot be used with security bits set. */ if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) { @@ -1493,14 +1503,14 @@ relock: goto relock; } - err = generic_write_checks(iocb, from); - if (err <= 0) { + ret = generic_write_checks(iocb, from); + if (ret <= 0) { btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); - return err; + return ret; } - err = btrfs_write_check(iocb, from, err); - if (err < 0) { + ret = btrfs_write_check(iocb, from, ret); + if (ret < 0) { btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto out; } @@ -1552,15 +1562,15 @@ relock: btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); if (IS_ERR_OR_NULL(dio)) - err = PTR_ERR_OR_ZERO(dio); + ret = PTR_ERR_OR_ZERO(dio); else - err = iomap_dio_complete(dio); + ret = iomap_dio_complete(dio); /* No increment (+=) because iomap returns a cumulative value. */ - if (err > 0) - written = err; + if (ret > 0) + written = ret; - if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) { + if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) { const size_t left = iov_iter_count(from); /* * We have more data left to write. Try to fault in as many as @@ -1577,7 +1587,7 @@ relock: * to buffered IO in case we haven't made any progress. */ if (left == prev_left) { - err = -ENOTBLK; + ret = -ENOTBLK; } else { fault_in_iov_iter_readable(from, left); prev_left = left; @@ -1586,10 +1596,10 @@ relock: } /* - * If 'err' is -ENOTBLK or we have not written all data, then it means + * If 'ret' is -ENOTBLK or we have not written all data, then it means * we must fallback to buffered IO. */ - if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from)) + if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from)) goto out; buffered: @@ -1600,14 +1610,14 @@ buffered: * below, we will block when flushing and waiting for the IO. */ if (iocb->ki_flags & IOCB_NOWAIT) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } pos = iocb->ki_pos; written_buffered = btrfs_buffered_write(iocb, from); if (written_buffered < 0) { - err = written_buffered; + ret = written_buffered; goto out; } /* @@ -1615,18 +1625,18 @@ buffered: * able to read what was just written. */ endbyte = pos + written_buffered - 1; - err = btrfs_fdatawrite_range(inode, pos, endbyte); - if (err) + ret = btrfs_fdatawrite_range(inode, pos, endbyte); + if (ret) goto out; - err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte); - if (err) + ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte); + if (ret) goto out; written += written_buffered; iocb->ki_pos = pos + written_buffered; invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT, endbyte >> PAGE_SHIFT); out: - return err < 0 ? err : written; + return ret < 0 ? ret : written; } static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, @@ -2029,6 +2039,172 @@ out_release_extents: goto out; } +/* + * btrfs_page_mkwrite() is not allowed to change the file size as it gets + * called from a page fault handler when a page is first dirtied. Hence we must + * be careful to check for EOF conditions here. We set the page up correctly + * for a written page which means we get ENOSPC checking when writing into + * holes and correct delalloc and unwritten extent mapping on filesystems that + * support these features. + * + * We are not allowed to take the i_mutex here so we have to play games to + * protect against truncate races as the page could now be beyond EOF. Because + * truncate_setsize() writes the inode size before removing pages, once we have + * the page lock we can determine safely if the page is beyond EOF. If it is not + * beyond EOF, then the page is guaranteed safe against truncation until we + * unlock the page. + */ +static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct folio *folio = page_folio(page); + struct inode *inode = file_inode(vmf->vma->vm_file); + struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + struct extent_changeset *data_reserved = NULL; + unsigned long zero_start; + loff_t size; + vm_fault_t ret; + int ret2; + int reserved = 0; + u64 reserved_space; + u64 page_start; + u64 page_end; + u64 end; + + ASSERT(folio_order(folio) == 0); + + reserved_space = PAGE_SIZE; + + sb_start_pagefault(inode->i_sb); + page_start = page_offset(page); + page_end = page_start + PAGE_SIZE - 1; + end = page_end; + + /* + * Reserving delalloc space after obtaining the page lock can lead to + * deadlock. For example, if a dirty page is locked by this function + * and the call to btrfs_delalloc_reserve_space() ends up triggering + * dirty page write out, then the btrfs_writepages() function could + * end up waiting indefinitely to get a lock on the page currently + * being processed by btrfs_page_mkwrite() function. + */ + ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, + page_start, reserved_space); + if (!ret2) { + ret2 = file_update_time(vmf->vma->vm_file); + reserved = 1; + } + if (ret2) { + ret = vmf_error(ret2); + if (reserved) + goto out; + goto out_noreserve; + } + + /* Make the VM retry the fault. */ + ret = VM_FAULT_NOPAGE; +again: + down_read(&BTRFS_I(inode)->i_mmap_lock); + lock_page(page); + size = i_size_read(inode); + + if ((page->mapping != inode->i_mapping) || + (page_start >= size)) { + /* Page got truncated out from underneath us. */ + goto out_unlock; + } + wait_on_page_writeback(page); + + lock_extent(io_tree, page_start, page_end, &cached_state); + ret2 = set_page_extent_mapped(page); + if (ret2 < 0) { + ret = vmf_error(ret2); + unlock_extent(io_tree, page_start, page_end, &cached_state); + goto out_unlock; + } + + /* + * We can't set the delalloc bits if there are pending ordered + * extents. Drop our locks and wait for them to finish. + */ + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, &cached_state); + unlock_page(page); + up_read(&BTRFS_I(inode)->i_mmap_lock); + btrfs_start_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + goto again; + } + + if (page->index == ((size - 1) >> PAGE_SHIFT)) { + reserved_space = round_up(size - page_start, fs_info->sectorsize); + if (reserved_space < PAGE_SIZE) { + end = page_start + reserved_space - 1; + btrfs_delalloc_release_space(BTRFS_I(inode), + data_reserved, page_start, + PAGE_SIZE - reserved_space, true); + } + } + + /* + * page_mkwrite gets called when the page is firstly dirtied after it's + * faulted in, but write(2) could also dirty a page and set delalloc + * bits, thus in this case for space account reason, we still need to + * clear any delalloc bits within this page range since we have to + * reserve data&meta space before lock_page() (see above comments). + */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, &cached_state); + + ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, + &cached_state); + if (ret2) { + unlock_extent(io_tree, page_start, page_end, &cached_state); + ret = VM_FAULT_SIGBUS; + goto out_unlock; + } + + /* Page is wholly or partially inside EOF. */ + if (page_start + PAGE_SIZE > size) + zero_start = offset_in_page(size); + else + zero_start = PAGE_SIZE; + + if (zero_start != PAGE_SIZE) + memzero_page(page, zero_start, PAGE_SIZE - zero_start); + + btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); + btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); + btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); + + btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); + + unlock_extent(io_tree, page_start, page_end, &cached_state); + up_read(&BTRFS_I(inode)->i_mmap_lock); + + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); + return VM_FAULT_LOCKED; + +out_unlock: + unlock_page(page); + up_read(&BTRFS_I(inode)->i_mmap_lock); +out: + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, + reserved_space, (ret != 0)); +out_noreserve: + sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); + return ret; +} + static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, @@ -2258,7 +2434,6 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_key key; int slot; - struct btrfs_ref ref = { 0 }; int ret; if (replace_len == 0) @@ -2314,15 +2489,17 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, extent_info->qgroup_reserved, &key); } else { + struct btrfs_ref ref = { + .action = BTRFS_ADD_DELAYED_REF, + .bytenr = extent_info->disk_offset, + .num_bytes = extent_info->disk_len, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; u64 ref_offset; - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, - extent_info->disk_offset, - extent_info->disk_len, 0, - root->root_key.objectid); ref_offset = extent_info->file_offset - extent_info->data_offset; - btrfs_init_data_ref(&ref, root->root_key.objectid, - btrfs_ino(inode), ref_offset, 0, false); + btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); } diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 93f5c57ea4e3..89f0650631cd 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -9,7 +9,6 @@ #include <linux/compiler.h> #include <linux/math.h> #include <linux/atomic.h> -#include <linux/blkdev.h> #include <linux/percpu_counter.h> #include <linux/completion.h> #include <linux/lockdep.h> @@ -630,6 +629,10 @@ struct btrfs_fs_info { s32 dirty_metadata_batch; s32 delalloc_batch; + struct percpu_counter evictable_extent_maps; + u64 extent_map_shrinker_last_root; + u64 extent_map_shrinker_last_ino; + /* Protected by 'trans_lock'. */ struct list_head dirty_cowonly_roots; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 9c1394c0a6d7..84a94d19b22c 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -670,16 +670,18 @@ delete: } if (del_item && extent_start != 0 && !control->skip_ref_updates) { - struct btrfs_ref ref = { 0 }; + struct btrfs_ref ref = { + .action = BTRFS_DROP_DELAYED_REF, + .bytenr = extent_start, + .num_bytes = extent_num_bytes, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_header_owner(leaf), + }; bytes_deleted += extent_num_bytes; - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, - extent_start, extent_num_bytes, 0, - root->root_key.objectid); - btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), - control->ino, extent_offset, - root->root_key.objectid, false); + btrfs_init_data_ref(&ref, control->ino, extent_offset, + btrfs_root_id(root), false); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f454ba349683..753db965f7c0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -254,7 +254,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation"); btrfs_warn_rl(fs_info, "csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - inode->root->root_key.objectid, btrfs_ino(inode), file_off, + btrfs_root_id(inode->root), btrfs_ino(inode), file_off, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); @@ -264,7 +264,7 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off logical += file_off; btrfs_warn_rl(fs_info, "csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - inode->root->root_key.objectid, + btrfs_root_id(inode->root), btrfs_ino(inode), file_off, logical, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), @@ -331,15 +331,15 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, const u32 csum_size = root->fs_info->csum_size; /* For data reloc tree, it's better to do a backref lookup instead. */ - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID) return print_data_reloc_error(inode, logical_start, csum, csum_expected, mirror_num); /* Output without objectid, which is more meaningful */ - if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) { + if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) { btrfs_warn_rl(root->fs_info, "csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - root->root_key.objectid, btrfs_ino(inode), + btrfs_root_id(root), btrfs_ino(inode), logical_start, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), @@ -347,7 +347,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, } else { btrfs_warn_rl(root->fs_info, "csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - root->root_key.objectid, btrfs_ino(inode), + btrfs_root_id(root), btrfs_ino(inode), logical_start, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), @@ -512,12 +512,13 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, bool extent_inserted, size_t size, size_t compressed_size, int compress_type, - struct page **compressed_pages, + struct folio *compressed_folio, bool update_i_size) { struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct page *page = NULL; + const u32 sectorsize = trans->fs_info->sectorsize; char *kaddr; unsigned long ptr; struct btrfs_file_extent_item *ei; @@ -525,10 +526,23 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, size_t cur_size = size; u64 i_size; - ASSERT((compressed_size > 0 && compressed_pages) || - (compressed_size == 0 && !compressed_pages)); + /* + * The decompressed size must still be no larger than a sector. Under + * heavy race, we can have size == 0 passed in, but that shouldn't be a + * big deal and we can continue the insertion. + */ + ASSERT(size <= sectorsize); + + /* + * The compressed size also needs to be no larger than a sector. + * That's also why we only need one page as the parameter. + */ + if (compressed_folio) + ASSERT(compressed_size <= sectorsize); + else + ASSERT(compressed_size == 0); - if (compressed_size && compressed_pages) + if (compressed_size && compressed_folio) cur_size = compressed_size; if (!extent_inserted) { @@ -556,21 +570,10 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, ptr = btrfs_file_extent_inline_start(ei); if (compress_type != BTRFS_COMPRESS_NONE) { - struct page *cpage; - int i = 0; - while (compressed_size > 0) { - cpage = compressed_pages[i]; - cur_size = min_t(unsigned long, compressed_size, - PAGE_SIZE); - - kaddr = kmap_local_page(cpage); - write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap_local(kaddr); + kaddr = kmap_local_folio(compressed_folio, 0); + write_extent_buffer(leaf, kaddr, ptr, compressed_size); + kunmap_local(kaddr); - i++; - ptr += cur_size; - compressed_size -= cur_size; - } btrfs_set_file_extent_compression(leaf, ei, compress_type); } else { @@ -611,17 +614,62 @@ fail: return ret; } +static bool can_cow_file_range_inline(struct btrfs_inode *inode, + u64 offset, u64 size, + size_t compressed_size) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u64 data_len = (compressed_size ?: size); + + /* Inline extents must start at offset 0. */ + if (offset != 0) + return false; + + /* + * Due to the page size limit, for subpage we can only trigger the + * writeback for the dirty sectors of page, that means data writeback + * is doing more writeback than what we want. + * + * This is especially unexpected for some call sites like fallocate, + * where we only increase i_size after everything is done. + * This means we can trigger inline extent even if we didn't want to. + * So here we skip inline extent creation completely. + */ + if (fs_info->sectorsize != PAGE_SIZE) + return false; + + /* Inline extents are limited to sectorsize. */ + if (size > fs_info->sectorsize) + return false; + + /* We cannot exceed the maximum inline data size. */ + if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) + return false; + + /* We cannot exceed the user specified max_inline size. */ + if (data_len > fs_info->max_inline) + return false; + + /* Inline extents must be the entirety of the file. */ + if (size < i_size_read(&inode->vfs_inode)) + return false; + + return true; +} /* * conditionally insert an inline extent into the file. This * does the checks required to make sure the data is small enough * to fit as an inline extent. + * + * If being used directly, you must have already checked we're allowed to cow + * the range by getting true from can_cow_file_range_inline(). */ -static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, - size_t compressed_size, - int compress_type, - struct page **compressed_pages, - bool update_i_size) +static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offset, + u64 size, size_t compressed_size, + int compress_type, + struct folio *compressed_folio, + bool update_i_size) { struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *root = inode->root; @@ -631,18 +679,6 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, int ret; struct btrfs_path *path; - /* - * We can create an inline extent if it ends at or beyond the current - * i_size, is no larger than a sector (decompressed), and the (possibly - * compressed) data fits in a leaf and the configured maximum inline - * size. - */ - if (size < i_size_read(&inode->vfs_inode) || - size > fs_info->sectorsize || - data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || - data_len > fs_info->max_inline) - return 1; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -668,7 +704,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, size, compressed_size, compress_type, - compressed_pages, update_i_size); + compressed_folio, update_i_size); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; @@ -701,12 +737,44 @@ out: return ret; } +static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 offset, + u64 end, + size_t compressed_size, + int compress_type, + struct folio *compressed_folio, + bool update_i_size) +{ + struct extent_state *cached = NULL; + unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED; + u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1); + int ret; + + if (!can_cow_file_range_inline(inode, offset, size, compressed_size)) + return 1; + + lock_extent(&inode->io_tree, offset, end, &cached); + ret = __cow_file_range_inline(inode, offset, size, compressed_size, + compress_type, compressed_folio, + update_i_size); + if (ret > 0) { + unlock_extent(&inode->io_tree, offset, end, &cached); + return ret; + } + + extent_clear_unlock_delalloc(inode, offset, end, NULL, &cached, + clear_flags, + PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK); + return ret; +} + struct async_extent { u64 start; u64 ram_size; u64 compressed_size; - struct page **pages; - unsigned long nr_pages; + struct folio **folios; + unsigned long nr_folios; int compress_type; struct list_head list; }; @@ -731,8 +799,8 @@ struct async_cow { static noinline int add_async_extent(struct async_chunk *cow, u64 start, u64 ram_size, u64 compressed_size, - struct page **pages, - unsigned long nr_pages, + struct folio **folios, + unsigned long nr_folios, int compress_type) { struct async_extent *async_extent; @@ -743,8 +811,8 @@ static noinline int add_async_extent(struct async_chunk *cow, async_extent->start = start; async_extent->ram_size = ram_size; async_extent->compressed_size = compressed_size; - async_extent->pages = pages; - async_extent->nr_pages = nr_pages; + async_extent->folios = folios; + async_extent->nr_folios = nr_folios; async_extent->compress_type = compress_type; list_add_tail(&async_extent->list, &cow->extents); return 0; @@ -848,8 +916,8 @@ static void compress_file_range(struct btrfs_work *work) u64 actual_end; u64 i_size; int ret = 0; - struct page **pages; - unsigned long nr_pages; + struct folio **folios; + unsigned long nr_folios; unsigned long total_compressed = 0; unsigned long total_in = 0; unsigned int poff; @@ -879,9 +947,9 @@ static void compress_file_range(struct btrfs_work *work) barrier(); actual_end = min_t(u64, i_size, end + 1); again: - pages = NULL; - nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; - nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES); + folios = NULL; + nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; + nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES); /* * we don't want to send crud past the end of i_size through @@ -930,8 +998,8 @@ again: if (!inode_need_compress(inode, start, end)) goto cleanup_and_bail_uncompressed; - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) { + folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS); + if (!folios) { /* * Memory allocation failure is not a fatal error, we can fall * back to uncompressed code. @@ -945,9 +1013,9 @@ again: compress_type = inode->prop_compress; /* Compression level is applied here. */ - ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4), - mapping, start, pages, &nr_pages, &total_in, - &total_compressed); + ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4), + mapping, start, folios, &nr_folios, &total_in, + &total_compressed); if (ret) goto mark_incompressible; @@ -957,7 +1025,7 @@ again: */ poff = offset_in_page(total_compressed); if (poff) - memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff); + folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff); /* * Try to create an inline extent. @@ -968,43 +1036,16 @@ again: * Check cow_file_range() for why we don't even try to create inline * extent for the subpage case. */ - if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { - if (total_in < actual_end) { - ret = cow_file_range_inline(inode, actual_end, 0, - BTRFS_COMPRESS_NONE, NULL, - false); - } else { - ret = cow_file_range_inline(inode, actual_end, - total_compressed, - compress_type, pages, - false); - } - if (ret <= 0) { - unsigned long clear_flags = EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | - EXTENT_DO_ACCOUNTING; - - if (ret < 0) - mapping_set_error(mapping, -EIO); - - /* - * inline extent creation worked or returned error, - * we don't need to create any more async work items. - * Unlock and free up our temp pages. - * - * We use DO_ACCOUNTING here because we need the - * delalloc_release_metadata to be done _after_ we drop - * our outstanding extent for clearing delalloc for this - * range. - */ - extent_clear_unlock_delalloc(inode, start, end, - NULL, - clear_flags, - PAGE_UNLOCK | - PAGE_START_WRITEBACK | - PAGE_END_WRITEBACK); - goto free_pages; - } + if (total_in < actual_end) + ret = cow_file_range_inline(inode, start, end, 0, + BTRFS_COMPRESS_NONE, NULL, false); + else + ret = cow_file_range_inline(inode, start, end, total_compressed, + compress_type, folios[0], false); + if (ret <= 0) { + if (ret < 0) + mapping_set_error(mapping, -EIO); + goto free_pages; } /* @@ -1026,8 +1067,8 @@ again: * The async work queues will take care of doing actual allocation on * disk for these compressed pages, and will submit the bios. */ - ret = add_async_extent(async_chunk, start, total_in, total_compressed, pages, - nr_pages, compress_type); + ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios, + nr_folios, compress_type); BUG_ON(ret); if (start + total_in < end) { start += total_in; @@ -1044,12 +1085,12 @@ cleanup_and_bail_uncompressed: BTRFS_COMPRESS_NONE); BUG_ON(ret); free_pages: - if (pages) { - for (i = 0; i < nr_pages; i++) { - WARN_ON(pages[i]->mapping); - btrfs_free_compr_page(pages[i]); + if (folios) { + for (i = 0; i < nr_folios; i++) { + WARN_ON(folios[i]->mapping); + btrfs_free_compr_folio(folios[i]); } - kfree(pages); + kfree(folios); } } @@ -1057,16 +1098,16 @@ static void free_async_extent_pages(struct async_extent *async_extent) { int i; - if (!async_extent->pages) + if (!async_extent->folios) return; - for (i = 0; i < async_extent->nr_pages; i++) { - WARN_ON(async_extent->pages[i]->mapping); - btrfs_free_compr_page(async_extent->pages[i]); + for (i = 0; i < async_extent->nr_folios; i++) { + WARN_ON(async_extent->folios[i]->mapping); + btrfs_free_compr_folio(async_extent->folios[i]); } - kfree(async_extent->pages); - async_extent->nr_pages = 0; - async_extent->pages = NULL; + kfree(async_extent->folios); + async_extent->nr_folios = 0; + async_extent->folios = NULL; } static void submit_uncompressed_range(struct btrfs_inode *inode, @@ -1113,6 +1154,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, struct btrfs_ordered_extent *ordered; struct btrfs_key ins; struct page *locked_page = NULL; + struct extent_state *cached = NULL; struct extent_map *em; int ret = 0; u64 start = async_extent->start; @@ -1132,7 +1174,6 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, if (!(start >= locked_page_end || end <= locked_page_start)) locked_page = async_chunk->locked_page; } - lock_extent(io_tree, start, end, NULL); if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { submit_uncompressed_range(inode, async_extent, locked_page); @@ -1154,6 +1195,8 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, goto done; } + lock_extent(io_tree, start, end, &cached); + /* Here we're doing allocation and writeback of the compressed pages */ em = create_io_em(inode, start, async_extent->ram_size, /* len */ @@ -1187,11 +1230,11 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, /* Clear dirty, set writeback and unlock the pages. */ extent_clear_unlock_delalloc(inode, start, end, - NULL, EXTENT_LOCKED | EXTENT_DELALLOC, + NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_START_WRITEBACK); btrfs_submit_compressed_write(ordered, - async_extent->pages, /* compressed_pages */ - async_extent->nr_pages, + async_extent->folios, /* compressed_folios */ + async_extent->nr_folios, async_chunk->write_flags, true); *alloc_hint = ins.objectid + ins.offset; done: @@ -1205,7 +1248,8 @@ out_free_reserve: btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); mapping_set_error(inode->vfs_inode.i_mapping, -EIO); extent_clear_unlock_delalloc(inode, start, end, - NULL, EXTENT_LOCKED | EXTENT_DELALLOC | + NULL, &cached, + EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | @@ -1215,7 +1259,7 @@ out_free_reserve: kthread_associate_blkcg(NULL); btrfs_debug(fs_info, "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d", - root->root_key.objectid, btrfs_ino(inode), start, + btrfs_root_id(root), btrfs_ino(inode), start, async_extent->ram_size, ret); kfree(async_extent); } @@ -1287,6 +1331,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_state *cached = NULL; u64 alloc_hint = 0; u64 orig_start = start; u64 num_bytes; @@ -1312,53 +1357,21 @@ static noinline int cow_file_range(struct btrfs_inode *inode, inode_should_defrag(inode, start, end, num_bytes, SZ_64K); - /* - * Due to the page size limit, for subpage we can only trigger the - * writeback for the dirty sectors of page, that means data writeback - * is doing more writeback than what we want. - * - * This is especially unexpected for some call sites like fallocate, - * where we only increase i_size after everything is done. - * This means we can trigger inline extent even if we didn't want to. - * So here we skip inline extent creation completely. - */ - if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) { - u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode), - end + 1); - + if (!no_inline) { /* lets try to make an inline extent */ - ret = cow_file_range_inline(inode, actual_end, 0, + ret = cow_file_range_inline(inode, start, end, 0, BTRFS_COMPRESS_NONE, NULL, false); - if (ret == 0) { - /* - * We use DO_ACCOUNTING here because we need the - * delalloc_release_metadata to be run _after_ we drop - * our outstanding extent for clearing delalloc for this - * range. - */ - extent_clear_unlock_delalloc(inode, start, end, - locked_page, - EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | - EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | - PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); + if (ret <= 0) { /* - * locked_page is locked by the caller of - * writepage_delalloc(), not locked by - * __process_pages_contig(). + * We succeeded, return 1 so the caller knows we're done + * with this page and already handled the IO. * - * We can't let __process_pages_contig() to unlock it, - * as it doesn't have any subpage::writers recorded. - * - * Here we manually unlock the page, since the caller - * can't determine if it's an inline extent or a - * compressed extent. + * If there was an error then cow_file_range_inline() has + * already done the cleanup. */ - unlock_page(locked_page); - ret = 1; + if (ret == 0) + ret = 1; goto done; - } else if (ret < 0) { - goto out_unlock; } } @@ -1418,6 +1431,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode, extent_reserved = true; ram_size = ins.offset; + + lock_extent(&inode->io_tree, start, start + ram_size - 1, + &cached); + em = create_io_em(inode, start, ins.offset, /* len */ start, /* orig_start */ ins.objectid, /* block_start */ @@ -1427,6 +1444,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, BTRFS_COMPRESS_NONE, /* compress_type */ BTRFS_ORDERED_REGULAR /* type */); if (IS_ERR(em)) { + unlock_extent(&inode->io_tree, start, + start + ram_size - 1, &cached); ret = PTR_ERR(em); goto out_reserve; } @@ -1437,6 +1456,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, 0, 1 << BTRFS_ORDERED_REGULAR, BTRFS_COMPRESS_NONE); if (IS_ERR(ordered)) { + unlock_extent(&inode->io_tree, start, + start + ram_size - 1, &cached); ret = PTR_ERR(ordered); goto out_drop_extent_cache; } @@ -1476,7 +1497,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, page_ops |= PAGE_SET_ORDERED; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, - locked_page, + locked_page, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); if (num_bytes < cur_alloc_size) @@ -1535,10 +1556,17 @@ out_unlock: if (!locked_page) mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, - locked_page, 0, page_ops); + locked_page, NULL, 0, page_ops); } /* + * At this point we're unlocked, we want to make sure we're only + * clearing these flags under the extent lock, so lock the rest of the + * range and clear everything up. + */ + lock_extent(&inode->io_tree, start, end, NULL); + + /* * For the range (2). If we reserved an extent for our delalloc range * (or a subrange) and failed to create the respective ordered extent, * then it means that when we reserved the extent we decremented the @@ -1551,7 +1579,7 @@ out_unlock: if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, - locked_page, + locked_page, &cached, clear_bits, page_ops); start += cur_alloc_size; @@ -1566,7 +1594,7 @@ out_unlock: if (start < end) { clear_bits |= EXTENT_CLEAR_DATA_RESV; extent_clear_unlock_delalloc(inode, start, end, locked_page, - clear_bits, page_ops); + &cached, clear_bits, page_ops); } return ret; } @@ -1639,7 +1667,6 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, if (!ctx) return false; - unlock_extent(&inode->io_tree, start, end, NULL); set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); async_chunk = ctx->chunks; @@ -1733,29 +1760,6 @@ static noinline int run_delalloc_cow(struct btrfs_inode *inode, return 1; } -static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, - u64 bytenr, u64 num_bytes, bool nowait) -{ - struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr); - struct btrfs_ordered_sum *sums; - int ret; - LIST_HEAD(list); - - ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1, - &list, 0, nowait); - if (ret == 0 && list_empty(&list)) - return 0; - - while (!list_empty(&list)) { - sums = list_entry(list.next, struct btrfs_ordered_sum, list); - list_del(&sums->list); - kfree(sums); - } - if (ret < 0) - return ret; - return 1; -} - static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, const u64 start, const u64 end) { @@ -1763,6 +1767,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); const u64 range_bytes = end + 1 - start; struct extent_io_tree *io_tree = &inode->io_tree; + struct extent_state *cached_state = NULL; u64 range_start = start; u64 count; int ret; @@ -1799,6 +1804,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, * group that contains that extent to RO mode and therefore force COW * when starting writeback. */ + lock_extent(io_tree, start, end, &cached_state); count = count_range_bits(io_tree, &range_start, end, range_bytes, EXTENT_NORESERVE, 0, NULL); if (count > 0 || is_space_ino || is_reloc_ino) { @@ -1817,6 +1823,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, NULL); } + unlock_extent(io_tree, start, end, &cached_state); /* * Don't try to create inline extents, as a mix of inline extent that @@ -1870,6 +1877,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, struct extent_buffer *leaf = path->nodes[0]; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *fi; + struct btrfs_root *csum_root; u64 extent_end; u8 extent_type; int can_nocow = 0; @@ -1930,7 +1938,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, if (args->free_path) { /* * We don't need the path anymore, plus through the - * csum_exist_in_range() call below we will end up allocating + * btrfs_lookup_csums_list() call below we will end up allocating * another path. So free the path to avoid unnecessary extra * memory usage. */ @@ -1951,8 +1959,11 @@ static int can_nocow_file_extent(struct btrfs_path *path, * Force COW if csums exist in the range. This ensures that csums for a * given extent are either valid or do not exist. */ - ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes, - nowait); + + csum_root = btrfs_csum_root(root->fs_info, args->disk_bytenr); + ret = btrfs_lookup_csums_list(csum_root, args->disk_bytenr, + args->disk_bytenr + args->num_bytes - 1, + NULL, nowait); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; @@ -2002,12 +2013,13 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, nocow_args.end = end; nocow_args.writeback_path = true; - while (1) { + while (cur_offset <= end) { struct btrfs_block_group *nocow_bg = NULL; struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; + struct extent_state *cached_state = NULL; u64 extent_end; u64 ram_bytes; u64 nocow_end; @@ -2145,6 +2157,8 @@ must_cow: } nocow_end = cur_offset + nocow_args.num_bytes - 1; + lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state); + is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC; if (is_prealloc) { u64 orig_start = found_key.offset - nocow_args.extent_offset; @@ -2158,6 +2172,8 @@ must_cow: ram_bytes, BTRFS_COMPRESS_NONE, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { + unlock_extent(&inode->io_tree, cur_offset, + nocow_end, &cached_state); btrfs_dec_nocow_writers(nocow_bg); ret = PTR_ERR(em); goto error; @@ -2178,6 +2194,8 @@ must_cow: btrfs_drop_extent_map_range(inode, cur_offset, nocow_end, false); } + unlock_extent(&inode->io_tree, cur_offset, + nocow_end, &cached_state); ret = PTR_ERR(ordered); goto error; } @@ -2192,8 +2210,8 @@ must_cow: btrfs_put_ordered_extent(ordered); extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, - locked_page, EXTENT_LOCKED | - EXTENT_DELALLOC | + locked_page, &cached_state, + EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, PAGE_UNLOCK | PAGE_SET_ORDERED); @@ -2206,8 +2224,6 @@ must_cow: */ if (ret) goto error; - if (cur_offset > end) - break; } btrfs_release_path(path); @@ -2233,13 +2249,23 @@ error: */ if (cow_start != (u64)-1) cur_offset = cow_start; - if (cur_offset < end) + + /* + * We need to lock the extent here because we're clearing DELALLOC and + * we're not locked at this point. + */ + if (cur_offset < end) { + struct extent_state *cached = NULL; + + lock_extent(&inode->io_tree, cur_offset, end, &cached); extent_clear_unlock_delalloc(inode, cur_offset, end, - locked_page, EXTENT_LOCKED | - EXTENT_DELALLOC | EXTENT_DEFRAG | + locked_page, &cached, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); + } btrfs_free_path(path); return ret; } @@ -3181,7 +3207,6 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) btrfs_abort_transaction(trans, ret); goto out; } - ret = 0; out: clear_extent_bit(&inode->io_tree, start, end, clear_bits, &cached_state); @@ -3200,9 +3225,8 @@ out: * set the mapping error, so we need to set it if we're the ones * marking this ordered extent as failed. */ - if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR, - &ordered_extent->flags)) - mapping_set_error(ordered_extent->inode->i_mapping, -EIO); + if (ret) + btrfs_mark_ordered_extent_error(ordered_extent); if (truncated) unwritten_start += logical_len; @@ -3256,7 +3280,7 @@ out: * Actually free the qgroup rsv which was released when * the ordered extent was created. */ - btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid, + btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root), ordered_extent->qgroup_rsv, BTRFS_QGROUP_RSV_DATA); } @@ -3923,7 +3947,7 @@ cache_acl: btrfs_err(fs_info, "error loading props for ino %llu (root %llu): %d", btrfs_ino(BTRFS_I(inode)), - root->root_key.objectid, ret); + btrfs_root_id(root), ret); } if (path != in_path) btrfs_free_path(path); @@ -4282,7 +4306,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, /* This needs to handle no-key deletions later on */ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { - objectid = inode->root->root_key.objectid; + objectid = btrfs_root_id(inode->root); } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { objectid = inode->location.objectid; } else { @@ -4340,7 +4364,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_release_path(path); } else { ret = btrfs_del_root_ref(trans, objectid, - root->root_key.objectid, dir_ino, + btrfs_root_id(root), dir_ino, &index, &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); @@ -4390,7 +4414,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) dir_id, &name, 0); if (di && !IS_ERR(di)) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); - if (key.objectid == root->root_key.objectid) { + if (key.objectid == btrfs_root_id(root)) { ret = -EPERM; btrfs_err(fs_info, "deleting default subvolume %llu is not allowed", @@ -4400,7 +4424,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) btrfs_release_path(path); } - key.objectid = root->root_key.objectid; + key.objectid = btrfs_root_id(root); key.type = BTRFS_ROOT_REF_KEY; key.offset = (u64)-1; @@ -4420,8 +4444,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) if (path->slots[0] > 0) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == root->root_key.objectid && - key.type == BTRFS_ROOT_REF_KEY) + if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY) ret = -ENOTEMPTY; } out: @@ -4433,64 +4456,26 @@ out: static void btrfs_prune_dentries(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - struct rb_node *node; - struct rb_node *prev; - struct btrfs_inode *entry; - struct inode *inode; - u64 objectid = 0; + struct btrfs_inode *inode; + u64 min_ino = 0; if (!BTRFS_FS_ERROR(fs_info)) WARN_ON(btrfs_root_refs(&root->root_item) != 0); - spin_lock(&root->inode_lock); -again: - node = root->inode_tree.rb_node; - prev = NULL; - while (node) { - prev = node; - entry = rb_entry(node, struct btrfs_inode, rb_node); + inode = btrfs_find_first_inode(root, min_ino); + while (inode) { + if (atomic_read(&inode->vfs_inode.i_count) > 1) + d_prune_aliases(&inode->vfs_inode); - if (objectid < btrfs_ino(entry)) - node = node->rb_left; - else if (objectid > btrfs_ino(entry)) - node = node->rb_right; - else - break; - } - if (!node) { - while (prev) { - entry = rb_entry(prev, struct btrfs_inode, rb_node); - if (objectid <= btrfs_ino(entry)) { - node = prev; - break; - } - prev = rb_next(prev); - } - } - while (node) { - entry = rb_entry(node, struct btrfs_inode, rb_node); - objectid = btrfs_ino(entry) + 1; - inode = igrab(&entry->vfs_inode); - if (inode) { - spin_unlock(&root->inode_lock); - if (atomic_read(&inode->i_count) > 1) - d_prune_aliases(inode); - /* - * btrfs_drop_inode will have it removed from the inode - * cache when its usage count hits zero. - */ - iput(inode); - cond_resched(); - spin_lock(&root->inode_lock); - goto again; - } - - if (cond_resched_lock(&root->inode_lock)) - goto again; - - node = rb_next(node); + min_ino = btrfs_ino(inode) + 1; + /* + * btrfs_drop_inode() will have it removed from the inode + * cache when its usage count hits zero. + */ + iput(&inode->vfs_inode); + cond_resched(); + inode = btrfs_find_first_inode(root, min_ino); } - spin_unlock(&root->inode_lock); } int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) @@ -4517,7 +4502,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu during send", - dest->root_key.objectid); + btrfs_root_id(dest)); ret = -EPERM; goto out_up_write; } @@ -4525,7 +4510,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu with active swapfile", - root->root_key.objectid); + btrfs_root_id(root)); ret = -EPERM; goto out_up_write; } @@ -4586,7 +4571,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { ret = btrfs_insert_orphan_item(trans, fs_info->tree_root, - dest->root_key.objectid); + btrfs_root_id(dest)); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -4594,8 +4579,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) } ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid, - BTRFS_UUID_KEY_SUBVOL, - dest->root_key.objectid); + BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest)); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -4604,7 +4588,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) ret = btrfs_uuid_tree_remove(trans, dest->root_item.received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, - dest->root_key.objectid); + btrfs_root_id(dest)); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -4645,7 +4629,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - int err = 0; + int ret = 0; struct btrfs_trans_handle *trans; u64 last_unlink_trans; struct fscrypt_name fname; @@ -4661,33 +4645,33 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) return btrfs_delete_subvolume(BTRFS_I(dir), dentry); } - err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); - if (err) - return err; + ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (ret) + return ret; /* This needs to handle no-key deletions later on */ trans = __unlink_start_trans(BTRFS_I(dir)); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_notrans; } if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); + ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); goto out; } - err = btrfs_orphan_add(trans, BTRFS_I(inode)); - if (err) + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + if (ret) goto out; last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; /* now the directory is empty */ - err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), &fname.disk_name); - if (!err) { + if (!ret) { btrfs_i_size_write(BTRFS_I(inode), 0); /* * Propagate the last_unlink_trans value of the deleted dir to @@ -4709,7 +4693,7 @@ out_notrans: btrfs_btree_balance_dirty(fs_info); fscrypt_free_filename(&fname); - return err; + return ret; } /* @@ -4933,16 +4917,16 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) u64 last_byte; u64 cur_offset; u64 hole_size; - int err = 0; + int ret = 0; /* * If our size started in the middle of a block we need to zero out the * rest of the block before we expand the i_size, otherwise we could * expose stale data. */ - err = btrfs_truncate_block(inode, oldsize, 0, 0); - if (err) - return err; + ret = btrfs_truncate_block(inode, oldsize, 0, 0); + if (ret) + return ret; if (size <= hole_start) return 0; @@ -4953,7 +4937,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) while (1) { em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset); if (IS_ERR(em)) { - err = PTR_ERR(em); + ret = PTR_ERR(em); em = NULL; break; } @@ -4964,13 +4948,13 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) if (!(em->flags & EXTENT_FLAG_PREALLOC)) { struct extent_map *hole_em; - err = maybe_insert_hole(inode, cur_offset, hole_size); - if (err) + ret = maybe_insert_hole(inode, cur_offset, hole_size); + if (ret) break; - err = btrfs_inode_set_file_extent_range(inode, + ret = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); - if (err) + if (ret) break; hole_em = alloc_extent_map(); @@ -4991,12 +4975,12 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) hole_em->ram_bytes = hole_size; hole_em->generation = btrfs_get_fs_generation(fs_info); - err = btrfs_replace_extent_map_range(inode, hole_em, true); + ret = btrfs_replace_extent_map_range(inode, hole_em, true); free_extent_map(hole_em); } else { - err = btrfs_inode_set_file_extent_range(inode, + ret = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); - if (err) + if (ret) break; } next: @@ -5008,7 +4992,7 @@ next: } free_extent_map(em); unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); - return err; + return ret; } static int btrfs_setsize(struct inode *inode, struct iattr *attr) @@ -5284,7 +5268,7 @@ void btrfs_evict_inode(struct inode *inode) if (inode->i_nlink && ((btrfs_root_refs(&root->root_item) != 0 && - root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || + btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID) || btrfs_is_free_space_inode(BTRFS_I(inode)))) goto out; @@ -5296,7 +5280,7 @@ void btrfs_evict_inode(struct inode *inode) if (inode->i_nlink > 0) { BUG_ON(btrfs_root_refs(&root->root_item) != 0 && - root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID); + btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID); goto out; } @@ -5468,7 +5452,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, } err = -ENOENT; - key.objectid = dir->root->root_key.objectid; + key.objectid = btrfs_root_id(dir->root); key.type = BTRFS_ROOT_REF_KEY; key.offset = location->objectid; @@ -6427,8 +6411,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (ret) { btrfs_err(fs_info, "error inheriting props for ino %llu (root %llu): %d", - btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, - ret); + btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret); } /* @@ -6501,7 +6484,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_add_root_ref(trans, key.objectid, - root->root_key.objectid, parent_ino, + btrfs_root_id(root), parent_ino, index, name); } else if (add_backref) { ret = btrfs_insert_inode_ref(trans, root, name, @@ -6544,7 +6527,7 @@ fail_dir_item: u64 local_index; int err; err = btrfs_del_root_ref(trans, key.objectid, - root->root_key.objectid, parent_ino, + btrfs_root_id(root), parent_ino, &local_index, name); if (err) btrfs_abort_transaction(trans, err); @@ -6642,7 +6625,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, int drop_inode = 0; /* do not allow sys_link's with other subvols of the same device */ - if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid) + if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root)) return -EXDEV; if (inode->i_nlink >= BTRFS_LINK_MAX) @@ -6989,7 +6972,7 @@ insert: } write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + ret = btrfs_add_extent_mapping(inode, &em, start, len); write_unlock(&em_tree->lock); out: btrfs_free_path(path); @@ -7316,11 +7299,49 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, struct extent_map *em; int ret; + /* + * Note the missing NOCOW type. + * + * For pure NOCOW writes, we should not create an io extent map, but + * just reusing the existing one. + * Only PREALLOC writes (NOCOW write into preallocated range) can + * create an io extent map. + */ ASSERT(type == BTRFS_ORDERED_PREALLOC || type == BTRFS_ORDERED_COMPRESSED || - type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_REGULAR); + switch (type) { + case BTRFS_ORDERED_PREALLOC: + /* Uncompressed extents. */ + ASSERT(block_len == len); + + /* We're only referring part of a larger preallocated extent. */ + ASSERT(block_len <= ram_bytes); + break; + case BTRFS_ORDERED_REGULAR: + /* Uncompressed extents. */ + ASSERT(block_len == len); + + /* COW results a new extent matching our file extent size. */ + ASSERT(orig_block_len == len); + ASSERT(ram_bytes == len); + + /* Since it's a new extent, we should not have any offset. */ + ASSERT(orig_start == start); + break; + case BTRFS_ORDERED_COMPRESSED: + /* Must be compressed. */ + ASSERT(compress_type != BTRFS_COMPRESS_NONE); + + /* + * Encoded write can make us to refer to part of the + * uncompressed extent. + */ + ASSERT(len <= ram_bytes); + break; + } + em = alloc_extent_map(); if (!em) return ERR_PTR(-ENOMEM); @@ -7334,9 +7355,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, em->ram_bytes = ram_bytes; em->generation = -1; em->flags |= EXTENT_FLAG_PINNED; - if (type == BTRFS_ORDERED_PREALLOC) - em->flags |= EXTENT_FLAG_FILLING; - else if (type == BTRFS_ORDERED_COMPRESSED) + if (type == BTRFS_ORDERED_COMPRESSED) extent_map_set_compression(em, compress_type); ret = btrfs_replace_extent_map_range(inode, em, true); @@ -7923,17 +7942,6 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } -static int btrfs_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - return extent_writepages(mapping, wbc); -} - -static void btrfs_readahead(struct readahead_control *rac) -{ - extent_readahead(rac); -} - /* * For release_folio() and invalidate_folio() we have a race window where * folio_end_writeback() is called but the subpage spinlock is not yet released. @@ -7970,13 +7978,12 @@ static void wait_subpage_spinlock(struct page *page) static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) { - int ret = try_release_extent_mapping(&folio->page, gfp_flags); - - if (ret == 1) { + if (try_release_extent_mapping(&folio->page, gfp_flags)) { wait_subpage_spinlock(&folio->page); clear_page_extent_mapped(&folio->page); + return true; } - return ret; + return false; } static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) @@ -8174,173 +8181,6 @@ next: clear_page_extent_mapped(&folio->page); } -/* - * btrfs_page_mkwrite() is not allowed to change the file size as it gets - * called from a page fault handler when a page is first dirtied. Hence we must - * be careful to check for EOF conditions here. We set the page up correctly - * for a written page which means we get ENOSPC checking when writing into - * holes and correct delalloc and unwritten extent mapping on filesystems that - * support these features. - * - * We are not allowed to take the i_mutex here so we have to play games to - * protect against truncate races as the page could now be beyond EOF. Because - * truncate_setsize() writes the inode size before removing pages, once we have - * the page lock we can determine safely if the page is beyond EOF. If it is not - * beyond EOF, then the page is guaranteed safe against truncation until we - * unlock the page. - */ -vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) -{ - struct page *page = vmf->page; - struct folio *folio = page_folio(page); - struct inode *inode = file_inode(vmf->vma->vm_file); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_ordered_extent *ordered; - struct extent_state *cached_state = NULL; - struct extent_changeset *data_reserved = NULL; - unsigned long zero_start; - loff_t size; - vm_fault_t ret; - int ret2; - int reserved = 0; - u64 reserved_space; - u64 page_start; - u64 page_end; - u64 end; - - ASSERT(folio_order(folio) == 0); - - reserved_space = PAGE_SIZE; - - sb_start_pagefault(inode->i_sb); - page_start = page_offset(page); - page_end = page_start + PAGE_SIZE - 1; - end = page_end; - - /* - * Reserving delalloc space after obtaining the page lock can lead to - * deadlock. For example, if a dirty page is locked by this function - * and the call to btrfs_delalloc_reserve_space() ends up triggering - * dirty page write out, then the btrfs_writepages() function could - * end up waiting indefinitely to get a lock on the page currently - * being processed by btrfs_page_mkwrite() function. - */ - ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - page_start, reserved_space); - if (!ret2) { - ret2 = file_update_time(vmf->vma->vm_file); - reserved = 1; - } - if (ret2) { - ret = vmf_error(ret2); - if (reserved) - goto out; - goto out_noreserve; - } - - ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ -again: - down_read(&BTRFS_I(inode)->i_mmap_lock); - lock_page(page); - size = i_size_read(inode); - - if ((page->mapping != inode->i_mapping) || - (page_start >= size)) { - /* page got truncated out from underneath us */ - goto out_unlock; - } - wait_on_page_writeback(page); - - lock_extent(io_tree, page_start, page_end, &cached_state); - ret2 = set_page_extent_mapped(page); - if (ret2 < 0) { - ret = vmf_error(ret2); - unlock_extent(io_tree, page_start, page_end, &cached_state); - goto out_unlock; - } - - /* - * we can't set the delalloc bits if there are pending ordered - * extents. Drop our locks and wait for them to finish - */ - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, - PAGE_SIZE); - if (ordered) { - unlock_extent(io_tree, page_start, page_end, &cached_state); - unlock_page(page); - up_read(&BTRFS_I(inode)->i_mmap_lock); - btrfs_start_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); - goto again; - } - - if (page->index == ((size - 1) >> PAGE_SHIFT)) { - reserved_space = round_up(size - page_start, - fs_info->sectorsize); - if (reserved_space < PAGE_SIZE) { - end = page_start + reserved_space - 1; - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, page_start, - PAGE_SIZE - reserved_space, true); - } - } - - /* - * page_mkwrite gets called when the page is firstly dirtied after it's - * faulted in, but write(2) could also dirty a page and set delalloc - * bits, thus in this case for space account reason, we still need to - * clear any delalloc bits within this page range since we have to - * reserve data&meta space before lock_page() (see above comments). - */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, &cached_state); - - ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, - &cached_state); - if (ret2) { - unlock_extent(io_tree, page_start, page_end, &cached_state); - ret = VM_FAULT_SIGBUS; - goto out_unlock; - } - - /* page is wholly or partially inside EOF */ - if (page_start + PAGE_SIZE > size) - zero_start = offset_in_page(size); - else - zero_start = PAGE_SIZE; - - if (zero_start != PAGE_SIZE) - memzero_page(page, zero_start, PAGE_SIZE - zero_start); - - btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); - btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); - btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); - - btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); - - unlock_extent(io_tree, page_start, page_end, &cached_state); - up_read(&BTRFS_I(inode)->i_mmap_lock); - - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - sb_end_pagefault(inode->i_sb); - extent_changeset_free(data_reserved); - return VM_FAULT_LOCKED; - -out_unlock: - unlock_page(page); - up_read(&BTRFS_I(inode)->i_mmap_lock); -out: - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, - reserved_space, (ret != 0)); -out_noreserve: - sb_end_pagefault(inode->i_sb); - extent_changeset_free(data_reserved); - return ret; -} - static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) { struct btrfs_truncate_control control = { @@ -9671,7 +9511,7 @@ free_qgroup: * or we leak qgroup data reservation. */ btrfs_qgroup_free_refroot(inode->root->fs_info, - inode->root->root_key.objectid, qgroup_released, + btrfs_root_id(inode->root), qgroup_released, BTRFS_QGROUP_RSV_DATA); return ERR_PTR(ret); } @@ -10319,8 +10159,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, size_t orig_count; u64 start, end; u64 num_bytes, ram_bytes, disk_num_bytes; - unsigned long nr_pages, i; - struct page **pages; + unsigned long nr_folios, i; + struct folio **folios; struct btrfs_key ins; bool extent_reserved = false; struct extent_map *em; @@ -10409,24 +10249,24 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, * isn't. */ disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); - nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); - pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT); - if (!pages) + nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); + folios = kvcalloc(nr_folios, sizeof(struct page *), GFP_KERNEL_ACCOUNT); + if (!folios) return -ENOMEM; - for (i = 0; i < nr_pages; i++) { + for (i = 0; i < nr_folios; i++) { size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); char *kaddr; - pages[i] = alloc_page(GFP_KERNEL_ACCOUNT); - if (!pages[i]) { + folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0); + if (!folios[i]) { ret = -ENOMEM; - goto out_pages; + goto out_folios; } - kaddr = kmap_local_page(pages[i]); + kaddr = kmap_local_folio(folios[i], 0); if (copy_from_iter(kaddr, bytes, from) != bytes) { kunmap_local(kaddr); ret = -EFAULT; - goto out_pages; + goto out_folios; } if (bytes < PAGE_SIZE) memset(kaddr + bytes, 0, PAGE_SIZE - bytes); @@ -10438,12 +10278,12 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes); if (ret) - goto out_pages; + goto out_folios; ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping, start >> PAGE_SHIFT, end >> PAGE_SHIFT); if (ret) - goto out_pages; + goto out_folios; lock_extent(io_tree, start, end, &cached_state); ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); if (!ordered && @@ -10471,10 +10311,12 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, goto out_qgroup_free_data; /* Try an inline extent first. */ - if (start == 0 && encoded->unencoded_len == encoded->len && - encoded->unencoded_offset == 0) { - ret = cow_file_range_inline(inode, encoded->len, orig_count, - compression, pages, true); + if (encoded->unencoded_len == encoded->len && + encoded->unencoded_offset == 0 && + can_cow_file_range_inline(inode, start, encoded->len, orig_count)) { + ret = __cow_file_range_inline(inode, start, encoded->len, + orig_count, compression, folios[0], + true); if (ret <= 0) { if (ret == 0) ret = orig_count; @@ -10518,7 +10360,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, btrfs_delalloc_release_extents(inode, num_bytes); - btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false); + btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false); ret = orig_count; goto out; @@ -10540,12 +10382,12 @@ out_free_data_space: btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); out_unlock: unlock_extent(io_tree, start, end, &cached_state); -out_pages: - for (i = 0; i < nr_pages; i++) { - if (pages[i]) - __free_page(pages[i]); +out_folios: + for (i = 0; i < nr_folios; i++) { + if (folios[i]) + __folio_put(folios[i]); } - kvfree(pages); + kvfree(folios); out: if (ret >= 0) iocb->ki_pos += encoded->len; @@ -10772,7 +10614,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because subvolume %llu is being deleted", - root->root_key.objectid); + btrfs_root_id(root)); return -EPERM; } atomic_inc(&root->nr_swapfiles); @@ -10998,7 +10840,7 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en if (ordered) { btrfs_err(root->fs_info, "found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])", - start, end, btrfs_ino(inode), root->root_key.objectid, + start, end, btrfs_ino(inode), btrfs_root_id(root), ordered->file_offset, ordered->file_offset + ordered->num_bytes - 1); btrfs_put_ordered_extent(ordered); @@ -11007,6 +10849,65 @@ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 en ASSERT(ordered == NULL); } +/* + * Find the first inode with a minimum number. + * + * @root: The root to search for. + * @min_ino: The minimum inode number. + * + * Find the first inode in the @root with a number >= @min_ino and return it. + * Returns NULL if no such inode found. + */ +struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino) +{ + struct rb_node *node; + struct rb_node *prev; + struct btrfs_inode *inode; + + spin_lock(&root->inode_lock); +again: + node = root->inode_tree.rb_node; + prev = NULL; + while (node) { + prev = node; + inode = rb_entry(node, struct btrfs_inode, rb_node); + if (min_ino < btrfs_ino(inode)) + node = node->rb_left; + else if (min_ino > btrfs_ino(inode)) + node = node->rb_right; + else + break; + } + + if (!node) { + while (prev) { + inode = rb_entry(prev, struct btrfs_inode, rb_node); + if (min_ino <= btrfs_ino(inode)) { + node = prev; + break; + } + prev = rb_next(prev); + } + } + + while (node) { + inode = rb_entry(prev, struct btrfs_inode, rb_node); + if (igrab(&inode->vfs_inode)) { + spin_unlock(&root->inode_lock); + return inode; + } + + min_ino = btrfs_ino(inode) + 1; + if (cond_resched_lock(&root->inode_lock)) + goto again; + + node = rb_next(node); + } + spin_unlock(&root->inode_lock); + + return NULL; +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0493272a7668..efd5d6e9589e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -668,7 +668,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, /* Tree log can't currently deal with an inode which is a new root. */ btrfs_set_log_full_commit(trans); - ret = btrfs_qgroup_inherit(trans, 0, objectid, root->root_key.objectid, inherit); + ret = btrfs_qgroup_inherit(trans, 0, objectid, btrfs_root_id(root), inherit); if (ret) goto out; @@ -1510,7 +1510,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, spin_unlock(&root->root_item_lock); btrfs_warn(fs_info, "Attempt to set subvolume %llu read-write during send", - root->root_key.objectid); + btrfs_root_id(root)); ret = -EPERM; goto out_drop_sem; } @@ -1919,7 +1919,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct super_block *sb = inode->i_sb; struct btrfs_key upper_limit = BTRFS_I(inode)->location; - u64 treeid = BTRFS_I(inode)->root->root_key.objectid; + u64 treeid = btrfs_root_id(BTRFS_I(inode)->root); u64 dirid = args->dirid; unsigned long item_off; unsigned long item_len; @@ -2091,7 +2091,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root, * path is reset so it's consistent with btrfs_search_path_in_tree. */ if (args->treeid == 0) - args->treeid = root->root_key.objectid; + args->treeid = btrfs_root_id(root); if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { args->name[0] = 0; @@ -2187,7 +2187,7 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp) fs_info = BTRFS_I(inode)->root->fs_info; /* Get root_item of inode's subvolume */ - key.objectid = BTRFS_I(inode)->root->root_key.objectid; + key.objectid = btrfs_root_id(BTRFS_I(inode)->root); root = btrfs_get_fs_root(fs_info, key.objectid, true); if (IS_ERR(root)) { ret = PTR_ERR(root); @@ -2302,7 +2302,7 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, return PTR_ERR(rootrefs); } - objectid = root->root_key.objectid; + objectid = btrfs_root_id(root); key.objectid = objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = rootrefs->min_treeid; @@ -2386,7 +2386,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct mnt_idmap *idmap = file_mnt_idmap(file); char *subvol_name, *subvol_name_ptr = NULL; int subvol_namelen; - int err = 0; + int ret = 0; bool destroy_parent = false; /* We don't support snapshots with extent tree v2 yet. */ @@ -2402,7 +2402,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, return PTR_ERR(vol_args2); if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) { - err = -EOPNOTSUPP; + ret = -EOPNOTSUPP; goto out; } @@ -2411,31 +2411,31 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * name, same as v1 currently does. */ if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) { - err = btrfs_check_ioctl_vol_args2_subvol_name(vol_args2); - if (err < 0) + ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args2); + if (ret < 0) goto out; subvol_name = vol_args2->name; - err = mnt_want_write_file(file); - if (err) + ret = mnt_want_write_file(file); + if (ret) goto out; } else { struct inode *old_dir; if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) { - err = -EINVAL; + ret = -EINVAL; goto out; } - err = mnt_want_write_file(file); - if (err) + ret = mnt_want_write_file(file); + if (ret) goto out; dentry = btrfs_get_dentry(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, vol_args2->subvolid, 0); if (IS_ERR(dentry)) { - err = PTR_ERR(dentry); + ret = PTR_ERR(dentry); goto out_drop_write; } @@ -2455,7 +2455,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, */ dput(dentry); if (IS_ERR(parent)) { - err = PTR_ERR(parent); + ret = PTR_ERR(parent); goto out_drop_write; } old_dir = dir; @@ -2479,14 +2479,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * to delete without an idmapped mount. */ if (old_dir != dir && idmap != &nop_mnt_idmap) { - err = -EOPNOTSUPP; + ret = -EOPNOTSUPP; goto free_parent; } subvol_name_ptr = btrfs_get_subvol_name_from_objectid( fs_info, vol_args2->subvolid); if (IS_ERR(subvol_name_ptr)) { - err = PTR_ERR(subvol_name_ptr); + ret = PTR_ERR(subvol_name_ptr); goto free_parent; } /* subvol_name_ptr is already nul terminated */ @@ -2497,14 +2497,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (IS_ERR(vol_args)) return PTR_ERR(vol_args); - err = btrfs_check_ioctl_vol_args_path(vol_args); - if (err < 0) + ret = btrfs_check_ioctl_vol_args_path(vol_args); + if (ret < 0) goto out; subvol_name = vol_args->name; - err = mnt_want_write_file(file); - if (err) + ret = mnt_want_write_file(file); + if (ret) goto out; } @@ -2512,26 +2512,26 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (strchr(subvol_name, '/') || strncmp(subvol_name, "..", subvol_namelen) == 0) { - err = -EINVAL; + ret = -EINVAL; goto free_subvol_name; } if (!S_ISDIR(dir->i_mode)) { - err = -ENOTDIR; + ret = -ENOTDIR; goto free_subvol_name; } - err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); - if (err == -EINTR) + ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); + if (ret == -EINTR) goto free_subvol_name; dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen); if (IS_ERR(dentry)) { - err = PTR_ERR(dentry); + ret = PTR_ERR(dentry); goto out_unlock_dir; } if (d_really_is_negative(dentry)) { - err = -ENOENT; + ret = -ENOENT; goto out_dput; } @@ -2551,7 +2551,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * Users who want to delete empty subvols should try * rmdir(2). */ - err = -EPERM; + ret = -EPERM; if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED)) goto out_dput; @@ -2562,29 +2562,29 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * of the subvol, not a random directory contained * within it. */ - err = -EINVAL; + ret = -EINVAL; if (root == dest) goto out_dput; - err = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC); - if (err) + ret = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC); + if (ret) goto out_dput; } /* check if subvolume may be deleted by a user */ - err = btrfs_may_delete(idmap, dir, dentry, 1); - if (err) + ret = btrfs_may_delete(idmap, dir, dentry, 1); + if (ret) goto out_dput; if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { - err = -EINVAL; + ret = -EINVAL; goto out_dput; } btrfs_inode_lock(BTRFS_I(inode), 0); - err = btrfs_delete_subvolume(BTRFS_I(dir), dentry); + ret = btrfs_delete_subvolume(BTRFS_I(dir), dentry); btrfs_inode_unlock(BTRFS_I(inode), 0); - if (!err) + if (!ret) d_delete_notify(dir, dentry); out_dput: @@ -2601,7 +2601,7 @@ out_drop_write: out: kfree(vol_args2); kfree(vol_args); - return err; + return ret; } static int btrfs_ioctl_defrag(struct file *file, void __user *argp) @@ -2981,7 +2981,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) ret = PTR_ERR(new_root); goto out; } - if (!is_fstree(new_root->root_key.objectid)) { + if (!is_fstree(btrfs_root_id(new_root))) { ret = -ENOENT; goto out_free; } @@ -3947,7 +3947,7 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) qgroupid = sa->qgroupid; if (!qgroupid) { /* take the current subvol as qgroup */ - qgroupid = root->root_key.objectid; + qgroupid = btrfs_root_id(root); } ret = btrfs_limit_qgroup(trans, qgroupid, &sa->lim); @@ -4078,7 +4078,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, !btrfs_is_empty_uuid(root_item->received_uuid)) { ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, - root->root_key.objectid); + btrfs_root_id(root)); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); @@ -4102,7 +4102,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) { ret = btrfs_uuid_tree_add(trans, sa->uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, - root->root_key.objectid); + btrfs_root_id(root)); if (ret < 0 && ret != -EEXIST) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 99ccab86bb86..6a0b7abb5bd9 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -97,7 +97,7 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb) { if (test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) - btrfs_set_buffer_lockdep_class(root->root_key.objectid, + btrfs_set_buffer_lockdep_class(btrfs_root_id(root), eb, btrfs_header_level(eb)); } @@ -129,14 +129,14 @@ static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { } */ /* - * __btrfs_tree_read_lock - lock extent buffer for read + * btrfs_tree_read_lock_nested - lock extent buffer for read * @eb: the eb to be locked * @nest: the nesting level to be used for lockdep * * This takes the read lock on the extent buffer, using the specified nesting * level for lockdep purposes. */ -void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) +void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest) { u64 start_ns = 0; @@ -147,11 +147,6 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne trace_btrfs_tree_read_lock(eb, start_ns); } -void btrfs_tree_read_lock(struct extent_buffer *eb) -{ - __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL); -} - /* * Try-lock for read. * @@ -198,7 +193,7 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) * * Returns with the eb->lock write locked. */ -void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) +void btrfs_tree_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest) __acquires(&eb->lock) { u64 start_ns = 0; @@ -211,11 +206,6 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) trace_btrfs_tree_lock(eb, start_ns); } -void btrfs_tree_lock(struct extent_buffer *eb) -{ - __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL); -} - /* * Release the write lock. */ @@ -374,8 +364,12 @@ void btrfs_drew_write_lock(struct btrfs_drew_lock *lock) void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock) { - atomic_dec(&lock->writers); - cond_wake_up(&lock->pending_readers); + /* + * atomic_dec_and_test() implies a full barrier, so woken up readers are + * guaranteed to see the decrement. + */ + if (atomic_dec_and_test(&lock->writers)) + wake_up(&lock->pending_readers); } void btrfs_drew_read_lock(struct btrfs_drew_lock *lock) diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 9576f485a300..1bc8e6738879 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -163,12 +163,22 @@ enum btrfs_lockdep_trans_states { static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES, "too many lock subclasses defined"); -void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest); -void btrfs_tree_lock(struct extent_buffer *eb); +void btrfs_tree_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest); + +static inline void btrfs_tree_lock(struct extent_buffer *eb) +{ + btrfs_tree_lock_nested(eb, BTRFS_NESTING_NORMAL); +} + void btrfs_tree_unlock(struct extent_buffer *eb); -void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest); -void btrfs_tree_read_lock(struct extent_buffer *eb); +void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest); + +static inline void btrfs_tree_read_lock(struct extent_buffer *eb) +{ + btrfs_tree_read_lock_nested(eb, BTRFS_NESTING_NORMAL); +} + void btrfs_tree_read_unlock(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 3e5d3b7028e8..1c396ac167aa 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -130,17 +130,17 @@ static inline size_t read_compress_length(const char *buf) */ static int copy_compressed_data_to_page(char *compressed_data, size_t compressed_size, - struct page **out_pages, - unsigned long max_nr_page, + struct folio **out_folios, + unsigned long max_nr_folio, u32 *cur_out, const u32 sectorsize) { u32 sector_bytes_left; u32 orig_out; - struct page *cur_page; + struct folio *cur_folio; char *kaddr; - if ((*cur_out / PAGE_SIZE) >= max_nr_page) + if ((*cur_out / PAGE_SIZE) >= max_nr_folio) return -E2BIG; /* @@ -149,16 +149,16 @@ static int copy_compressed_data_to_page(char *compressed_data, */ ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize); - cur_page = out_pages[*cur_out / PAGE_SIZE]; + cur_folio = out_folios[*cur_out / PAGE_SIZE]; /* Allocate a new page */ - if (!cur_page) { - cur_page = btrfs_alloc_compr_page(); - if (!cur_page) + if (!cur_folio) { + cur_folio = btrfs_alloc_compr_folio(); + if (!cur_folio) return -ENOMEM; - out_pages[*cur_out / PAGE_SIZE] = cur_page; + out_folios[*cur_out / PAGE_SIZE] = cur_folio; } - kaddr = kmap_local_page(cur_page); + kaddr = kmap_local_folio(cur_folio, 0); write_compress_length(kaddr + offset_in_page(*cur_out), compressed_size); *cur_out += LZO_LEN; @@ -172,18 +172,18 @@ static int copy_compressed_data_to_page(char *compressed_data, kunmap_local(kaddr); - if ((*cur_out / PAGE_SIZE) >= max_nr_page) + if ((*cur_out / PAGE_SIZE) >= max_nr_folio) return -E2BIG; - cur_page = out_pages[*cur_out / PAGE_SIZE]; + cur_folio = out_folios[*cur_out / PAGE_SIZE]; /* Allocate a new page */ - if (!cur_page) { - cur_page = btrfs_alloc_compr_page(); - if (!cur_page) + if (!cur_folio) { + cur_folio = btrfs_alloc_compr_folio(); + if (!cur_folio) return -ENOMEM; - out_pages[*cur_out / PAGE_SIZE] = cur_page; + out_folios[*cur_out / PAGE_SIZE] = cur_folio; } - kaddr = kmap_local_page(cur_page); + kaddr = kmap_local_folio(cur_folio, 0); memcpy(kaddr + offset_in_page(*cur_out), compressed_data + *cur_out - orig_out, copy_len); @@ -209,15 +209,15 @@ out: return 0; } -int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out) +int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, + unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); const u32 sectorsize = inode_to_fs_info(mapping->host)->sectorsize; - struct page *page_in = NULL; + struct folio *folio_in = NULL; char *sizes_ptr; - const unsigned long max_nr_page = *out_pages; + const unsigned long max_nr_folio = *out_folios; int ret = 0; /* Points to the file offset of input data */ u64 cur_in = start; @@ -225,8 +225,8 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, u32 cur_out = 0; u32 len = *total_out; - ASSERT(max_nr_page > 0); - *out_pages = 0; + ASSERT(max_nr_folio > 0); + *out_folios = 0; *total_out = 0; *total_in = 0; @@ -243,15 +243,16 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, size_t out_len; /* Get the input page first */ - if (!page_in) { - page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT); - ASSERT(page_in); + if (!folio_in) { + ret = btrfs_compress_filemap_get_folio(mapping, cur_in, &folio_in); + if (ret < 0) + goto out; } /* Compress at most one sector of data each time */ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); ASSERT(in_len); - data_in = kmap_local_page(page_in); + data_in = kmap_local_folio(folio_in, 0); ret = lzo1x_1_compress(data_in + offset_in_page(cur_in), in_len, workspace->cbuf, &out_len, @@ -264,7 +265,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, } ret = copy_compressed_data_to_page(workspace->cbuf, out_len, - pages, max_nr_page, + folios, max_nr_folio, &cur_out, sectorsize); if (ret < 0) goto out; @@ -282,13 +283,13 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, /* Check if we have reached page boundary */ if (PAGE_ALIGNED(cur_in)) { - put_page(page_in); - page_in = NULL; + folio_put(folio_in); + folio_in = NULL; } } /* Store the size of all chunks of compressed data */ - sizes_ptr = kmap_local_page(pages[0]); + sizes_ptr = kmap_local_folio(folios[0], 0); write_compress_length(sizes_ptr, cur_out); kunmap_local(sizes_ptr); @@ -296,9 +297,9 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, *total_out = cur_out; *total_in = cur_in - start; out: - if (page_in) - put_page(page_in); - *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE); + if (folio_in) + folio_put(folio_in); + *out_folios = DIV_ROUND_UP(cur_out, PAGE_SIZE); return ret; } @@ -313,15 +314,15 @@ static void copy_compressed_segment(struct compressed_bio *cb, u32 orig_in = *cur_in; while (*cur_in < orig_in + len) { - struct page *cur_page; + struct folio *cur_folio; u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in), orig_in + len - *cur_in); ASSERT(copy_len); - cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE]; + cur_folio = cb->compressed_folios[*cur_in / PAGE_SIZE]; - memcpy_from_page(dest + *cur_in - orig_in, cur_page, - offset_in_page(*cur_in), copy_len); + memcpy_from_folio(dest + *cur_in - orig_in, cur_folio, + offset_in_folio(cur_folio, *cur_in), copy_len); *cur_in += copy_len; } @@ -341,7 +342,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* Bytes decompressed so far */ u32 cur_out = 0; - kaddr = kmap_local_page(cb->compressed_pages[0]); + kaddr = kmap_local_folio(cb->compressed_folios[0], 0); len_in = read_compress_length(kaddr); kunmap_local(kaddr); cur_in += LZO_LEN; @@ -363,7 +364,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* Go through each lzo segment */ while (cur_in < len_in) { - struct page *cur_page; + struct folio *cur_folio; /* Length of the compressed segment */ u32 seg_len; u32 sector_bytes_left; @@ -375,9 +376,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) */ ASSERT(cur_in / sectorsize == (cur_in + LZO_LEN - 1) / sectorsize); - cur_page = cb->compressed_pages[cur_in / PAGE_SIZE]; - ASSERT(cur_page); - kaddr = kmap_local_page(cur_page); + cur_folio = cb->compressed_folios[cur_in / PAGE_SIZE]; + ASSERT(cur_folio); + kaddr = kmap_local_folio(cur_folio, 0); seg_len = read_compress_length(kaddr + offset_in_page(cur_in)); kunmap_local(kaddr); cur_in += LZO_LEN; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index c2a42bcde98e..c5bdd674f55c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -294,6 +294,12 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, spin_unlock_irq(&inode->ordered_tree_lock); } +void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered) +{ + if (!test_and_set_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) + mapping_set_error(ordered->inode->i_mapping, -EIO); +} + static void finish_ordered_fn(struct btrfs_work *work) { struct btrfs_ordered_extent *ordered_extent; @@ -332,7 +338,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, if (WARN_ON_ONCE(len > ordered->bytes_left)) { btrfs_crit(fs_info, "bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu", - inode->root->root_key.objectid, btrfs_ino(inode), + btrfs_root_id(inode->root), btrfs_ino(inode), ordered->file_offset, ordered->num_bytes, len, ordered->bytes_left); ordered->bytes_left = 0; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 34413fc5b4bd..b6f6c6b91732 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -203,6 +203,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); struct btrfs_ordered_extent *btrfs_split_ordered_extent( struct btrfs_ordered_extent *ordered, u64 len); +void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered); int __init ordered_data_init(void); void __cold ordered_data_exit(void); diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 2a9b7b029eeb..155570e20f45 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -268,7 +268,7 @@ static void inode_prop_iterator(void *ctx, btrfs_warn(root->fs_info, "error applying prop %s to ino %llu (root %llu): %d", handler->xattr_name, btrfs_ino(BTRFS_I(inode)), - root->root_key.objectid, ret); + btrfs_root_id(root), ret); else set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 40e5f7f2fcb7..eb28141d5c37 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1536,18 +1536,15 @@ static int quick_update_accounting(struct btrfs_fs_info *fs_info, { struct btrfs_qgroup *qgroup; int ret = 1; - int err = 0; qgroup = find_qgroup_rb(fs_info, src); if (!qgroup) goto out; if (qgroup->excl == qgroup->rfer) { - ret = 0; - err = __qgroup_excl_accounting(fs_info, dst, qgroup, sign); - if (err < 0) { - ret = err; + ret = __qgroup_excl_accounting(fs_info, dst, qgroup, sign); + if (ret < 0) goto out; - } + ret = 0; } out: if (ret) @@ -3064,9 +3061,6 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, if (inherit->num_ref_copies > 0 || inherit->num_excl_copies > 0) return -EINVAL; - if (inherit->num_qgroups > PAGE_SIZE) - return -EINVAL; - if (size != struct_size(inherit, qgroups, inherit->num_qgroups)) return -EINVAL; @@ -3129,7 +3123,7 @@ static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info, qgids = res->qgroups; list_for_each_entry(qg_list, &inode_qg->groups, next_group) - qgids[i] = qg_list->group->qgroupid; + qgids[i++] = qg_list->group->qgroupid; *inherit = res; return 0; @@ -3471,7 +3465,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, { struct btrfs_qgroup *qgroup; struct btrfs_fs_info *fs_info = root->fs_info; - u64 ref_root = root->root_key.objectid; + u64 ref_root = btrfs_root_id(root); int ret = 0; LIST_HEAD(qgroup_list); @@ -3706,7 +3700,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; - int err = -ENOMEM; int ret = 0; bool stopped = false; bool did_leaf_rescans = false; @@ -3715,8 +3708,10 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) return; path = btrfs_alloc_path(); - if (!path) + if (!path) { + ret = -ENOMEM; goto out; + } /* * Rescan should only search for commit root, and any later difference * should be recorded by qgroup @@ -3724,18 +3719,17 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) path->search_commit_root = 1; path->skip_locking = 1; - err = 0; - while (!err && !(stopped = rescan_should_stop(fs_info))) { + while (!ret && !(stopped = rescan_should_stop(fs_info))) { trans = btrfs_start_transaction(fs_info->fs_root, 0); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); break; } - err = qgroup_rescan_leaf(trans, path); + ret = qgroup_rescan_leaf(trans, path); did_leaf_rescans = true; - if (err > 0) + if (ret > 0) btrfs_commit_transaction(trans); else btrfs_end_transaction(trans); @@ -3745,10 +3739,10 @@ out: btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); - if (err > 0 && + if (ret > 0 && fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - } else if (err < 0 || stopped) { + } else if (ret < 0 || stopped) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -3763,11 +3757,11 @@ out: if (did_leaf_rescans) { trans = btrfs_start_transaction(fs_info->quota_root, 1); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); trans = NULL; btrfs_err(fs_info, "fail to start transaction for status update: %d", - err); + ret); } } else { trans = NULL; @@ -3778,11 +3772,11 @@ out: fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; if (trans) { - ret = update_qgroup_status_item(trans); - if (ret < 0) { - err = ret; - btrfs_err(fs_info, "fail to update qgroup status: %d", - err); + int ret2 = update_qgroup_status_item(trans); + + if (ret2 < 0) { + ret = ret2; + btrfs_err(fs_info, "fail to update qgroup status: %d", ret); } } fs_info->qgroup_rescan_running = false; @@ -3799,11 +3793,11 @@ out: btrfs_info(fs_info, "qgroup scan paused"); } else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) { btrfs_info(fs_info, "qgroup scan cancelled"); - } else if (err >= 0) { + } else if (ret >= 0) { btrfs_info(fs_info, "qgroup scan completed%s", - err > 0 ? " (inconsistency flag cleared)" : ""); + ret > 0 ? " (inconsistency flag cleared)" : ""); } else { - btrfs_err(fs_info, "qgroup scan failed with %d", err); + btrfs_err(fs_info, "qgroup scan failed with %d", ret); } } @@ -4112,7 +4106,7 @@ static int qgroup_reserve_data(struct btrfs_inode *inode, int ret; if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(root->root_key.objectid) || len == 0) + !is_fstree(btrfs_root_id(root)) || len == 0) return 0; /* @reserved parameter is mandatory for qgroup */ @@ -4228,7 +4222,7 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode, goto out; freed += changeset.bytes_changed; } - btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed, + btrfs_qgroup_free_refroot(root->fs_info, btrfs_root_id(root), freed, BTRFS_QGROUP_RSV_DATA); if (freed_ret) *freed_ret = freed; @@ -4269,7 +4263,7 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, changeset.bytes_changed, trace_op); if (free) btrfs_qgroup_free_refroot(inode->root->fs_info, - inode->root->root_key.objectid, + btrfs_root_id(inode->root), changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); if (released) *released = changeset.bytes_changed; @@ -4364,7 +4358,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, int ret; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(root->root_key.objectid) || num_bytes == 0) + !is_fstree(btrfs_root_id(root)) || num_bytes == 0) return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); @@ -4409,13 +4403,13 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) struct btrfs_fs_info *fs_info = root->fs_info; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(root->root_key.objectid)) + !is_fstree(btrfs_root_id(root))) return; /* TODO: Update trace point to handle such free */ trace_qgroup_meta_free_all_pertrans(root); /* Special value -1 means to free all reserved space */ - btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1, + btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), (u64)-1, BTRFS_QGROUP_RSV_META_PERTRANS); } @@ -4425,7 +4419,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, struct btrfs_fs_info *fs_info = root->fs_info; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(root->root_key.objectid)) + !is_fstree(btrfs_root_id(root))) return; /* @@ -4436,8 +4430,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, num_bytes = sub_root_meta_rsv(root, num_bytes, type); BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); trace_qgroup_meta_reserve(root, -(s64)num_bytes, type); - btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, - num_bytes, type); + btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, type); } static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, @@ -4485,13 +4478,13 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) struct btrfs_fs_info *fs_info = root->fs_info; if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || - !is_fstree(root->root_key.objectid)) + !is_fstree(btrfs_root_id(root))) return; /* Same as btrfs_qgroup_free_meta_prealloc() */ num_bytes = sub_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC); trace_qgroup_meta_convert(root, num_bytes); - qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes); + qgroup_convert_meta(fs_info, btrfs_root_id(root), num_bytes); if (!sb_rdonly(fs_info->sb)) add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS); } @@ -4520,7 +4513,7 @@ void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode) btrfs_ino(inode), unode->val, unode->aux); } btrfs_qgroup_free_refroot(inode->root->fs_info, - inode->root->root_key.objectid, + btrfs_root_id(inode->root), changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); } @@ -4706,7 +4699,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, if (!btrfs_qgroup_full_accounting(fs_info)) return 0; - if (!is_fstree(root->root_key.objectid) || !root->reloc_root) + if (!is_fstree(btrfs_root_id(root)) || !root->reloc_root) return 0; spin_lock(&blocks->lock); diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 8c4fc98ca9ce..cf531255ab76 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -673,7 +673,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, int ret = 0; bool metadata; u64 bytenr = generic_ref->bytenr; - u64 num_bytes = generic_ref->len; + u64 num_bytes = generic_ref->num_bytes; u64 parent = generic_ref->parent; u64 ref_root = 0; u64 owner = 0; @@ -684,11 +684,11 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, if (generic_ref->type == BTRFS_REF_METADATA) { if (!parent) - ref_root = generic_ref->tree_ref.ref_root; + ref_root = generic_ref->ref_root; owner = generic_ref->tree_ref.level; } else if (!parent) { - ref_root = generic_ref->data_ref.ref_root; - owner = generic_ref->data_ref.ino; + ref_root = generic_ref->ref_root; + owner = generic_ref->data_ref.objectid; offset = generic_ref->data_ref.offset; } metadata = owner < BTRFS_FIRST_FREE_OBJECTID; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 08d0fb46ceec..d0a3fcecc46a 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -616,35 +616,6 @@ out: return ret; } -static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) -{ - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1, NULL); - unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1, NULL); -} - -static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) -{ - u64 range1_end = loff1 + len - 1; - u64 range2_end = loff2 + len - 1; - - if (inode1 < inode2) { - swap(inode1, inode2); - swap(loff1, loff2); - swap(range1_end, range2_end); - } else if (inode1 == inode2 && loff2 < loff1) { - swap(loff1, loff2); - swap(range1_end, range2_end); - } - - lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end, NULL); - lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end, NULL); - - btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end); - btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end); -} - static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2) { if (inode1 < inode2) @@ -662,17 +633,21 @@ static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2) static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, struct inode *dst, u64 dst_loff) { + const u64 end = dst_loff + len - 1; + struct extent_state *cached_state = NULL; struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info; const u64 bs = fs_info->sectorsize; int ret; /* - * Lock destination range to serialize with concurrent readahead() and - * source range to serialize with relocation. + * Lock destination range to serialize with concurrent readahead(), and + * we are safe from concurrency with relocation of source extents + * because we have already locked the inode's i_mmap_lock in exclusive + * mode. */ - btrfs_double_extent_lock(src, loff, dst, dst_loff, len); + lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state); ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); - btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); + unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state); btrfs_btree_balance_dirty(fs_info); @@ -690,7 +665,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, if (root_dst->send_in_progress) { btrfs_warn_rl(root_dst->fs_info, "cannot deduplicate to root %llu while send operations are using it (%d in progress)", - root_dst->root_key.objectid, + btrfs_root_id(root_dst), root_dst->send_in_progress); spin_unlock(&root_dst->root_item_lock); return -EAGAIN; @@ -724,6 +699,7 @@ out: static noinline int btrfs_clone_files(struct file *file, struct file *file_src, u64 off, u64 olen, u64 destoff) { + struct extent_state *cached_state = NULL; struct inode *inode = file_inode(file); struct inode *src = file_inode(file_src); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); @@ -731,6 +707,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, int wb_ret; u64 len = olen; u64 bs = fs_info->sectorsize; + u64 end; /* * VFS's generic_remap_file_range_prep() protects us from cloning the @@ -763,12 +740,15 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, } /* - * Lock destination range to serialize with concurrent readahead() and - * source range to serialize with relocation. + * Lock destination range to serialize with concurrent readahead(), and + * we are safe from concurrency with relocation of source extents + * because we have already locked the inode's i_mmap_lock in exclusive + * mode. */ - btrfs_double_extent_lock(src, off, inode, destoff, len); + end = destoff + len - 1; + lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); - btrfs_double_extent_unlock(src, off, inode, destoff, len); + unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); /* * We may have copied an inline extent into a page of the destination diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f96f267fb4aa..8b24bb5a0aa1 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -473,20 +473,19 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( struct btrfs_backref_node *node = NULL; struct btrfs_backref_edge *edge; int ret; - int err = 0; iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info); if (!iter) return ERR_PTR(-ENOMEM); path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } node = btrfs_backref_alloc_node(cache, bytenr, level); if (!node) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } @@ -497,10 +496,9 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( do { ret = btrfs_backref_add_tree_node(trans, cache, path, iter, node_key, cur); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } + edge = list_first_entry_or_null(&cache->pending_edge, struct btrfs_backref_edge, list[UPPER]); /* @@ -515,10 +513,8 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( /* Finish the upper linkage of newly added edges/nodes */ ret = btrfs_backref_finish_upper_links(cache, node); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } if (handle_useless_nodes(rc, node)) node = NULL; @@ -526,9 +522,9 @@ out: btrfs_free_path(iter->path); kfree(iter); btrfs_free_path(path); - if (err) { + if (ret) { btrfs_backref_error_cleanup(cache, node); - return ERR_PTR(err); + return ERR_PTR(ret); } ASSERT(!node || !node->detached); ASSERT(list_empty(&cache->useless_node) && @@ -754,7 +750,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, root_key.type = BTRFS_ROOT_ITEM_KEY; root_key.offset = objectid; - if (root->root_key.objectid == objectid) { + if (btrfs_root_id(root) == objectid) { u64 commit_root_gen; /* called by btrfs_init_reloc_root */ @@ -798,7 +794,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, btrfs_set_root_level(root_item, btrfs_header_level(eb)); btrfs_set_root_generation(root_item, trans->transid); - if (root->root_key.objectid == objectid) { + if (btrfs_root_id(root) == objectid) { btrfs_set_root_refs(root_item, 0); memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); @@ -876,8 +872,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, * We are merging reloc roots, we do not need new reloc trees. Also * reloc trees never need their own reloc tree. */ - if (!rc->create_reloc_tree || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + if (!rc->create_reloc_tree || btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) return 0; if (!trans->reloc_reserved) { @@ -885,7 +880,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, trans->block_rsv = rc->block_rsv; clear_rsv = 1; } - reloc_root = create_reloc_root(trans, root, root->root_key.objectid); + reloc_root = create_reloc_root(trans, root, btrfs_root_id(root)); if (clear_rsv) trans->block_rsv = rsv; if (IS_ERR(reloc_root)) @@ -952,60 +947,6 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, } /* - * helper to find first cached inode with inode number >= objectid - * in a subvolume - */ -static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid) -{ - struct rb_node *node; - struct rb_node *prev; - struct btrfs_inode *entry; - struct inode *inode; - - spin_lock(&root->inode_lock); -again: - node = root->inode_tree.rb_node; - prev = NULL; - while (node) { - prev = node; - entry = rb_entry(node, struct btrfs_inode, rb_node); - - if (objectid < btrfs_ino(entry)) - node = node->rb_left; - else if (objectid > btrfs_ino(entry)) - node = node->rb_right; - else - break; - } - if (!node) { - while (prev) { - entry = rb_entry(prev, struct btrfs_inode, rb_node); - if (objectid <= btrfs_ino(entry)) { - node = prev; - break; - } - prev = rb_next(prev); - } - } - while (node) { - entry = rb_entry(node, struct btrfs_inode, rb_node); - inode = igrab(&entry->vfs_inode); - if (inode) { - spin_unlock(&root->inode_lock); - return inode; - } - - objectid = btrfs_ino(entry) + 1; - if (cond_resched_lock(&root->inode_lock)) - goto again; - - node = rb_next(node); - } - spin_unlock(&root->inode_lock); - return NULL; -} - -/* * get new location of data */ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, @@ -1065,7 +1006,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; struct btrfs_file_extent_item *fi; - struct inode *inode = NULL; + struct btrfs_inode *inode = NULL; u64 parent; u64 bytenr; u64 new_bytenr = 0; @@ -1081,7 +1022,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, return 0; /* reloc trees always use full backref */ - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) parent = leaf->start; else parent = 0; @@ -1110,15 +1051,15 @@ int replace_file_extents(struct btrfs_trans_handle *trans, * if we are modifying block in fs tree, wait for read_folio * to complete and drop the extent cache */ - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) { if (first) { - inode = find_next_inode(root, key.objectid); + inode = btrfs_find_first_inode(root, key.objectid); first = 0; - } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) { - btrfs_add_delayed_iput(BTRFS_I(inode)); - inode = find_next_inode(root, key.objectid); + } else if (inode && btrfs_ino(inode) < key.objectid) { + btrfs_add_delayed_iput(inode); + inode = btrfs_find_first_inode(root, key.objectid); } - if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) { + if (inode && btrfs_ino(inode) == key.objectid) { struct extent_state *cached_state = NULL; end = key.offset + @@ -1127,16 +1068,20 @@ int replace_file_extents(struct btrfs_trans_handle *trans, fs_info->sectorsize)); WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); end--; - ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, - &cached_state); - if (!ret) + /* Take mmap lock to serialize with reflinks. */ + if (!down_read_trylock(&inode->i_mmap_lock)) + continue; + ret = try_lock_extent(&inode->io_tree, key.offset, + end, &cached_state); + if (!ret) { + up_read(&inode->i_mmap_lock); continue; + } - btrfs_drop_extent_map_range(BTRFS_I(inode), - key.offset, end, true); - unlock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, &cached_state); + btrfs_drop_extent_map_range(inode, key.offset, end, true); + unlock_extent(&inode->io_tree, key.offset, end, + &cached_state); + up_read(&inode->i_mmap_lock); } } @@ -1154,22 +1099,28 @@ int replace_file_extents(struct btrfs_trans_handle *trans, dirty = 1; key.offset -= btrfs_file_extent_offset(leaf, fi); - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, - num_bytes, parent, root->root_key.objectid); - btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), - key.objectid, key.offset, - root->root_key.objectid, false); + ref.action = BTRFS_ADD_DELAYED_REF; + ref.bytenr = new_bytenr; + ref.num_bytes = num_bytes; + ref.parent = parent; + ref.owning_root = btrfs_root_id(root); + ref.ref_root = btrfs_header_owner(leaf); + btrfs_init_data_ref(&ref, key.objectid, key.offset, + btrfs_root_id(root), false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; } - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - num_bytes, parent, root->root_key.objectid); - btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), - key.objectid, key.offset, - root->root_key.objectid, false); + ref.action = BTRFS_DROP_DELAYED_REF; + ref.bytenr = bytenr; + ref.num_bytes = num_bytes; + ref.parent = parent; + ref.owning_root = btrfs_root_id(root); + ref.ref_root = btrfs_header_owner(leaf); + btrfs_init_data_ref(&ref, key.objectid, key.offset, + btrfs_root_id(root), false); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1179,7 +1130,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, if (dirty) btrfs_mark_buffer_dirty(trans, leaf); if (inode) - btrfs_add_delayed_iput(BTRFS_I(inode)); + btrfs_add_delayed_iput(inode); return ret; } @@ -1225,8 +1176,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, int ret; int slot; - ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); - ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + ASSERT(btrfs_root_id(src) == BTRFS_TREE_RELOC_OBJECTID); + ASSERT(btrfs_root_id(dest) != BTRFS_TREE_RELOC_OBJECTID); last_snapshot = btrfs_root_last_snapshot(&src->root_item); again: @@ -1378,20 +1329,26 @@ again: path->slots[level], old_ptr_gen); btrfs_mark_buffer_dirty(trans, path->nodes[level]); - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr, - blocksize, path->nodes[level]->start, - src->root_key.objectid); - btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, - 0, true); + ref.action = BTRFS_ADD_DELAYED_REF; + ref.bytenr = old_bytenr; + ref.num_bytes = blocksize; + ref.parent = path->nodes[level]->start; + ref.owning_root = btrfs_root_id(src); + ref.ref_root = btrfs_root_id(src); + btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; } - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, - blocksize, 0, dest->root_key.objectid); - btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0, - true); + + ref.action = BTRFS_ADD_DELAYED_REF; + ref.bytenr = new_bytenr; + ref.num_bytes = blocksize; + ref.parent = 0; + ref.owning_root = btrfs_root_id(dest); + ref.ref_root = btrfs_root_id(dest); + btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1399,10 +1356,13 @@ again: } /* We don't know the real owning_root, use 0. */ - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr, - blocksize, path->nodes[level]->start, 0); - btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, - 0, true); + ref.action = BTRFS_DROP_DELAYED_REF; + ref.bytenr = new_bytenr; + ref.num_bytes = blocksize; + ref.parent = path->nodes[level]->start; + ref.owning_root = 0; + ref.ref_root = btrfs_root_id(src); + btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1410,10 +1370,13 @@ again: } /* We don't know the real owning_root, use 0. */ - btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr, - blocksize, 0, 0); - btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, - 0, true); + ref.action = BTRFS_DROP_DELAYED_REF; + ref.bytenr = old_bytenr; + ref.num_bytes = blocksize; + ref.parent = 0; + ref.owning_root = 0; + ref.ref_root = btrfs_root_id(dest); + btrfs_init_tree_ref(&ref, level - 1, 0, true); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1521,7 +1484,7 @@ static int invalidate_extent_cache(struct btrfs_root *root, const struct btrfs_key *max_key) { struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode = NULL; + struct btrfs_inode *inode = NULL; u64 objectid; u64 start, end; u64 ino; @@ -1531,23 +1494,24 @@ static int invalidate_extent_cache(struct btrfs_root *root, struct extent_state *cached_state = NULL; cond_resched(); - iput(inode); + if (inode) + iput(&inode->vfs_inode); if (objectid > max_key->objectid) break; - inode = find_next_inode(root, objectid); + inode = btrfs_find_first_inode(root, objectid); if (!inode) break; - ino = btrfs_ino(BTRFS_I(inode)); + ino = btrfs_ino(inode); if (ino > max_key->objectid) { - iput(inode); + iput(&inode->vfs_inode); break; } objectid = ino + 1; - if (!S_ISREG(inode->i_mode)) + if (!S_ISREG(inode->vfs_inode.i_mode)) continue; if (unlikely(min_key->objectid == ino)) { @@ -1580,9 +1544,9 @@ static int invalidate_extent_cache(struct btrfs_root *root, } /* the lock_extent waits for read_folio to complete */ - lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); - btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); + lock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_drop_extent_map_range(inode, start, end, true); + unlock_extent(&inode->io_tree, start, end, &cached_state); } return 0; } @@ -1617,7 +1581,7 @@ static int insert_dirty_subvol(struct btrfs_trans_handle *trans, int ret; /* @root must be a subvolume tree root with a valid reloc tree */ - ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + ASSERT(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID); ASSERT(reloc_root); reloc_root_item = &reloc_root->root_item; @@ -1646,7 +1610,7 @@ static int clean_dirty_subvols(struct reloc_control *rc) list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots, reloc_dirty_list) { - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) { /* Merged subvolume, cleanup its reloc root */ struct btrfs_root *reloc_root = root->reloc_root; @@ -1921,13 +1885,13 @@ again: if (root->reloc_root) { btrfs_err(fs_info, "reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu", - root->root_key.objectid, - root->reloc_root->root_key.objectid, + btrfs_root_id(root), + btrfs_root_id(root->reloc_root), root->reloc_root->root_key.type, root->reloc_root->root_key.offset, btrfs_root_generation( &root->reloc_root->root_item), - reloc_root->root_key.objectid, + btrfs_root_id(reloc_root), reloc_root->root_key.type, reloc_root->root_key.offset, btrfs_root_generation( @@ -1935,8 +1899,8 @@ again: } else { btrfs_err(fs_info, "reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu", - root->root_key.objectid, - reloc_root->root_key.objectid, + btrfs_root_id(root), + btrfs_root_id(reloc_root), reloc_root->root_key.type, reloc_root->root_key.offset, btrfs_root_generation( @@ -2193,7 +2157,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, return ERR_PTR(-EUCLEAN); } - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { ret = record_reloc_root_in_trans(trans, root); if (ret) return ERR_PTR(ret); @@ -2300,7 +2264,7 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node) if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return root; - if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) + if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID) fs_root = root; if (next != node) @@ -2316,9 +2280,8 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node) return fs_root; } -static noinline_for_stack -u64 calcu_metadata_size(struct reloc_control *rc, - struct btrfs_backref_node *node, int reserve) +static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc, + struct btrfs_backref_node *node) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; struct btrfs_backref_node *next = node; @@ -2327,12 +2290,12 @@ u64 calcu_metadata_size(struct reloc_control *rc, u64 num_bytes = 0; int index = 0; - BUG_ON(reserve && node->processed); + BUG_ON(node->processed); while (next) { cond_resched(); while (1) { - if (next->processed && (reserve || next != node)) + if (next->processed) break; num_bytes += fs_info->nodesize; @@ -2360,7 +2323,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, int ret; u64 tmp; - num_bytes = calcu_metadata_size(rc, node, 1) * 2; + num_bytes = calcu_metadata_size(rc, node) * 2; trans->block_rsv = rc->block_rsv; rc->reserved_bytes += num_bytes; @@ -2423,8 +2386,6 @@ static int do_relocation(struct btrfs_trans_handle *trans, path->lowest_level = node->level + 1; rc->backref_cache.path[node->level] = node; list_for_each_entry(edge, &node->upper, list[LOWER]) { - struct btrfs_ref ref = { 0 }; - cond_resched(); upper = edge->node[UPPER]; @@ -2512,19 +2473,23 @@ static int do_relocation(struct btrfs_trans_handle *trans, */ ASSERT(node->eb == eb); } else { + struct btrfs_ref ref = { + .action = BTRFS_ADD_DELAYED_REF, + .bytenr = node->eb->start, + .num_bytes = blocksize, + .parent = upper->eb->start, + .owning_root = btrfs_header_owner(upper->eb), + .ref_root = btrfs_header_owner(upper->eb), + }; + btrfs_set_node_blockptr(upper->eb, slot, node->eb->start); btrfs_set_node_ptr_generation(upper->eb, slot, trans->transid); btrfs_mark_buffer_dirty(trans, upper->eb); - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, - node->eb->start, blocksize, - upper->eb->start, - btrfs_header_owner(upper->eb)); btrfs_init_tree_ref(&ref, node->level, - btrfs_header_owner(upper->eb), - root->root_key.objectid, false); + btrfs_root_id(root), false); ret = btrfs_inc_extent_ref(trans, &ref); if (!ret) ret = btrfs_drop_subtree(trans, root, eb, @@ -2776,12 +2741,11 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct tree_block *block; struct tree_block *next; - int ret; - int err = 0; + int ret = 0; path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; + ret = -ENOMEM; goto out_free_blocks; } @@ -2796,8 +2760,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, /* Get first keys */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { if (!block->key_ready) { - err = get_tree_block_key(fs_info, block); - if (err) + ret = get_tree_block_key(fs_info, block); + if (ret) goto out_free_path; } } @@ -2807,25 +2771,23 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, node = build_backref_tree(trans, rc, &block->key, block->level, block->bytenr); if (IS_ERR(node)) { - err = PTR_ERR(node); + ret = PTR_ERR(node); goto out; } ret = relocate_tree_block(trans, rc, node, &block->key, path); - if (ret < 0) { - err = ret; + if (ret < 0) break; - } } out: - err = finish_pending_nodes(trans, rc, path, err); + ret = finish_pending_nodes(trans, rc, path, ret); out_free_path: btrfs_free_path(path); out_free_blocks: free_block_list(blocks); - return err; + return ret; } static noinline_for_stack int prealloc_file_extent_cluster( @@ -2850,7 +2812,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( * btrfs_do_readpage() call of previously relocated file cluster. * * If the current cluster starts in the above range, btrfs_do_readpage() - * will skip the read, and relocate_one_page() will later writeback + * will skip the read, and relocate_one_folio() will later writeback * the padding zeros as new data, causing data corruption. * * Here we have to manually invalidate the range (i_size, PAGE_END + 1). @@ -2859,7 +2821,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( struct address_space *mapping = inode->vfs_inode.i_mapping; struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; - struct page *page; + struct folio *folio; ASSERT(sectorsize < PAGE_SIZE); ASSERT(IS_ALIGNED(i_size, sectorsize)); @@ -2890,16 +2852,16 @@ static noinline_for_stack int prealloc_file_extent_cluster( clear_extent_bits(&inode->io_tree, i_size, round_up(i_size, PAGE_SIZE) - 1, EXTENT_UPTODATE); - page = find_lock_page(mapping, i_size >> PAGE_SHIFT); + folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT); /* * If page is freed we don't need to do anything then, as we * will re-read the whole page anyway. */ - if (page) { - btrfs_subpage_clear_uptodate(fs_info, page_folio(page), i_size, + if (!IS_ERR(folio)) { + btrfs_subpage_clear_uptodate(fs_info, folio, i_size, round_up(i_size, PAGE_SIZE) - i_size); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } } @@ -2984,68 +2946,71 @@ static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster, return cluster->boundary[cluster_nr + 1] - 1; } -static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, - const struct file_extent_cluster *cluster, - int *cluster_nr, unsigned long page_index) +static int relocate_one_folio(struct inode *inode, struct file_ra_state *ra, + const struct file_extent_cluster *cluster, + int *cluster_nr, unsigned long index) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 offset = BTRFS_I(inode)->index_cnt; const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); - struct page *page; - u64 page_start; - u64 page_end; + struct folio *folio; + u64 folio_start; + u64 folio_end; u64 cur; int ret; - ASSERT(page_index <= last_index); - page = find_lock_page(inode->i_mapping, page_index); - if (!page) { + ASSERT(index <= last_index); + folio = filemap_lock_folio(inode->i_mapping, index); + if (IS_ERR(folio)) { page_cache_sync_readahead(inode->i_mapping, ra, NULL, - page_index, last_index + 1 - page_index); - page = find_or_create_page(inode->i_mapping, page_index, mask); - if (!page) - return -ENOMEM; + index, last_index + 1 - index); + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); + if (IS_ERR(folio)) + return PTR_ERR(folio); } - if (PageReadahead(page)) + WARN_ON(folio_order(folio)); + + if (folio_test_readahead(folio)) page_cache_async_readahead(inode->i_mapping, ra, NULL, - page_folio(page), page_index, - last_index + 1 - page_index); + folio, index, + last_index + 1 - index); - if (!PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (!folio_test_uptodate(folio)) { ret = -EIO; - goto release_page; + goto release_folio; } } /* - * We could have lost page private when we dropped the lock to read the - * page above, make sure we set_page_extent_mapped here so we have any + * We could have lost folio private when we dropped the lock to read the + * folio above, make sure we set_page_extent_mapped here so we have any * of the subpage blocksize stuff we need in place. */ - ret = set_page_extent_mapped(page); + ret = set_folio_extent_mapped(folio); if (ret < 0) - goto release_page; + goto release_folio; - page_start = page_offset(page); - page_end = page_start + PAGE_SIZE - 1; + folio_start = folio_pos(folio); + folio_end = folio_start + PAGE_SIZE - 1; /* * Start from the cluster, as for subpage case, the cluster can start - * inside the page. + * inside the folio. */ - cur = max(page_start, cluster->boundary[*cluster_nr] - offset); - while (cur <= page_end) { + cur = max(folio_start, cluster->boundary[*cluster_nr] - offset); + while (cur <= folio_end) { struct extent_state *cached_state = NULL; u64 extent_start = cluster->boundary[*cluster_nr] - offset; u64 extent_end = get_cluster_boundary_end(cluster, *cluster_nr) - offset; - u64 clamped_start = max(page_start, extent_start); - u64 clamped_end = min(page_end, extent_end); + u64 clamped_start = max(folio_start, extent_start); + u64 clamped_end = min(folio_end, extent_end); u32 clamped_len = clamped_end + 1 - clamped_start; /* Reserve metadata for this range */ @@ -3053,7 +3018,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, clamped_len, clamped_len, false); if (ret) - goto release_page; + goto release_folio; /* Mark the range delalloc and dirty for later writeback */ lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, @@ -3069,20 +3034,18 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, clamped_len, true); btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); - goto release_page; + goto release_folio; } - btrfs_folio_set_dirty(fs_info, page_folio(page), - clamped_start, clamped_len); + btrfs_folio_set_dirty(fs_info, folio, clamped_start, clamped_len); /* - * Set the boundary if it's inside the page. + * Set the boundary if it's inside the folio. * Data relocation requires the destination extents to have the * same size as the source. * EXTENT_BOUNDARY bit prevents current extent from being merged * with previous extent. */ - if (in_range(cluster->boundary[*cluster_nr] - offset, - page_start, PAGE_SIZE)) { + if (in_range(cluster->boundary[*cluster_nr] - offset, folio_start, PAGE_SIZE)) { u64 boundary_start = cluster->boundary[*cluster_nr] - offset; u64 boundary_end = boundary_start + @@ -3105,8 +3068,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, break; } } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); balance_dirty_pages_ratelimited(inode->i_mapping); btrfs_throttle(fs_info); @@ -3114,9 +3077,9 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, ret = -ECANCELED; return ret; -release_page: - unlock_page(page); - put_page(page); +release_folio: + folio_unlock(folio); + folio_put(folio); return ret; } @@ -3151,7 +3114,7 @@ static int relocate_file_extent_cluster(struct inode *inode, last_index = (cluster->end - offset) >> PAGE_SHIFT; for (index = (cluster->start - offset) >> PAGE_SHIFT; index <= last_index && !ret; index++) - ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index); + ret = relocate_one_folio(inode, ra, cluster, &cluster_nr, index); if (ret == 0) WARN_ON(cluster_nr != cluster->nr); out: @@ -3928,7 +3891,7 @@ static noinline_for_stack struct inode *create_reloc_inode( struct btrfs_trans_handle *trans; struct btrfs_root *root; u64 objectid; - int err = 0; + int ret = 0; root = btrfs_grab_root(fs_info->data_reloc_root); trans = btrfs_start_transaction(root, 6); @@ -3937,31 +3900,31 @@ static noinline_for_stack struct inode *create_reloc_inode( return ERR_CAST(trans); } - err = btrfs_get_free_objectid(root, &objectid); - if (err) + ret = btrfs_get_free_objectid(root, &objectid); + if (ret) goto out; - err = __insert_orphan_inode(trans, root, objectid); - if (err) + ret = __insert_orphan_inode(trans, root, objectid); + if (ret) goto out; inode = btrfs_iget(fs_info->sb, objectid, root); if (IS_ERR(inode)) { delete_orphan_inode(trans, root, objectid); - err = PTR_ERR(inode); + ret = PTR_ERR(inode); inode = NULL; goto out; } BTRFS_I(inode)->index_cnt = group->start; - err = btrfs_orphan_add(trans, BTRFS_I(inode)); + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); out: btrfs_put_root(root); btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); - if (err) { + if (ret) { iput(inode); - inode = ERR_PTR(err); + inode = ERR_PTR(ret); } return inode; } @@ -4439,9 +4402,11 @@ int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered) ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, disk_bytenr + ordered->num_bytes - 1, - &list, 0, false); - if (ret) + &list, false); + if (ret < 0) { + btrfs_mark_ordered_extent_error(ordered); return ret; + } while (!list_empty(&list)) { struct btrfs_ordered_sum *sums = @@ -4491,8 +4456,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, btrfs_root_last_snapshot(&root->root_item)) first_cow = 1; - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID && - rc->create_reloc_tree) { + if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID && rc->create_reloc_tree) { WARN_ON(!first_cow && level == 0); node = rc->backref_cache.path[level]; @@ -4585,8 +4549,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, } new_root = pending->snap; - reloc_root = create_reloc_root(trans, root->reloc_root, - new_root->root_key.objectid); + reloc_root = create_reloc_root(trans, root->reloc_root, btrfs_root_id(new_root)); if (IS_ERR(reloc_root)) return PTR_ERR(reloc_root); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 7007f9e0c972..33962671a96c 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -148,8 +148,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root if (ret > 0) { btrfs_crit(fs_info, "unable to find root key (%llu %u %llu) in tree %llu", - key->objectid, key->type, key->offset, - root->root_key.objectid); + key->objectid, key->type, key->offset, btrfs_root_id(root)); ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); goto out; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 50b4a76ac88e..3dd4a48479a9 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -392,9 +392,8 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx, btrfs_err(sctx->send_root->fs_info, "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu", result_string, what, sctx->cmp_key->objectid, - sctx->send_root->root_key.objectid, - (sctx->parent_root ? - sctx->parent_root->root_key.objectid : 0)); + btrfs_root_id(sctx->send_root), + (sctx->parent_root ? btrfs_root_id(sctx->parent_root) : 0)); } __maybe_unused @@ -1316,9 +1315,9 @@ static int __clone_root_cmp_bsearch(const void *key, const void *elt) u64 root = (u64)(uintptr_t)key; const struct clone_root *cr = elt; - if (root < cr->root->root_key.objectid) + if (root < btrfs_root_id(cr->root)) return -1; - if (root > cr->root->root_key.objectid) + if (root > btrfs_root_id(cr->root)) return 1; return 0; } @@ -1328,9 +1327,9 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2) const struct clone_root *cr1 = e1; const struct clone_root *cr2 = e2; - if (cr1->root->root_key.objectid < cr2->root->root_key.objectid) + if (btrfs_root_id(cr1->root) < btrfs_root_id(cr2->root)) return -1; - if (cr1->root->root_key.objectid > cr2->root->root_key.objectid) + if (btrfs_root_id(cr1->root) > btrfs_root_id(cr2->root)) return 1; return 0; } @@ -1778,7 +1777,7 @@ static int read_symlink(struct btrfs_root *root, */ btrfs_err(root->fs_info, "Found empty symlink inode %llu at root %llu", - ino, root->root_key.objectid); + ino, btrfs_root_id(root)); ret = -EIO; goto out; } @@ -2532,7 +2531,7 @@ static int send_subvol_begin(struct send_ctx *sctx) return -ENOMEM; } - key.objectid = send_root->root_key.objectid; + key.objectid = btrfs_root_id(send_root); key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = 0; @@ -2548,7 +2547,7 @@ static int send_subvol_begin(struct send_ctx *sctx) leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.type != BTRFS_ROOT_BACKREF_KEY || - key.objectid != send_root->root_key.objectid) { + key.objectid != btrfs_root_id(send_root)) { ret = -ENOENT; goto out; } @@ -5274,10 +5273,11 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; - struct page *page; + struct folio *folio; pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; unsigned pg_offset = offset_in_page(offset); + struct address_space *mapping = sctx->cur_inode->i_mapping; int ret; ret = put_data_header(sctx, len); @@ -5290,44 +5290,44 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) unsigned cur_len = min_t(unsigned, len, PAGE_SIZE - pg_offset); - page = find_lock_page(sctx->cur_inode->i_mapping, index); - if (!page) { - page_cache_sync_readahead(sctx->cur_inode->i_mapping, + folio = filemap_lock_folio(mapping, index); + if (IS_ERR(folio)) { + page_cache_sync_readahead(mapping, &sctx->ra, NULL, index, last_index + 1 - index); - page = find_or_create_page(sctx->cur_inode->i_mapping, - index, GFP_KERNEL); - if (!page) { - ret = -ENOMEM; + folio = filemap_grab_folio(mapping, index); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); break; } } - if (PageReadahead(page)) - page_cache_async_readahead(sctx->cur_inode->i_mapping, - &sctx->ra, NULL, page_folio(page), + WARN_ON(folio_order(folio)); + + if (folio_test_readahead(folio)) + page_cache_async_readahead(mapping, &sctx->ra, NULL, folio, index, last_index + 1 - index); - if (!PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); + if (!folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); btrfs_err(fs_info, "send: IO error at offset %llu for inode %llu root %llu", - page_offset(page), sctx->cur_ino, - sctx->send_root->root_key.objectid); - put_page(page); + folio_pos(folio), sctx->cur_ino, + btrfs_root_id(sctx->send_root)); + folio_put(folio); ret = -EIO; break; } } - memcpy_from_page(sctx->send_buf + sctx->send_size, page, - pg_offset, cur_len); - unlock_page(page); - put_page(page); + memcpy_from_folio(sctx->send_buf + sctx->send_size, folio, + pg_offset, cur_len); + folio_unlock(folio); + folio_put(folio); index++; pg_offset = 0; len -= cur_len; @@ -5388,7 +5388,7 @@ static int send_clone(struct send_ctx *sctx, btrfs_debug(sctx->send_root->fs_info, "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu", - offset, len, clone_root->root->root_key.objectid, + offset, len, btrfs_root_id(clone_root->root), clone_root->ino, clone_root->offset); p = fs_path_alloc(); @@ -7337,7 +7337,7 @@ static int search_key_again(const struct send_ctx *sctx, "send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d", key->objectid, key->type, key->offset, (root == sctx->parent_root ? "parent" : "send"), - root->root_key.objectid, path->lowest_level, + btrfs_root_id(root), path->lowest_level, path->slots[path->lowest_level]); return -EUCLEAN; } @@ -8071,7 +8071,7 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) if (root->send_in_progress < 0) btrfs_err(root->fs_info, "send_in_progress unbalanced %d root %llu", - root->send_in_progress, root->root_key.objectid); + root->send_in_progress, btrfs_root_id(root)); spin_unlock(&root->root_item_lock); } @@ -8079,7 +8079,7 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root) { btrfs_warn_rl(root->fs_info, "cannot use root %llu for send while deduplications on it are in progress (%d in progress)", - root->root_key.objectid, root->dedupe_in_progress); + btrfs_root_id(root), root->dedupe_in_progress); } long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7e44ccaf348f..2dbc930a20f7 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1097,10 +1097,9 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) #endif if (btrfs_test_opt(info, REF_VERIFY)) seq_puts(seq, ",ref_verify"); - seq_printf(seq, ",subvolid=%llu", - BTRFS_I(d_inode(dentry))->root->root_key.objectid); + seq_printf(seq, ",subvolid=%llu", btrfs_root_id(BTRFS_I(d_inode(dentry))->root)); subvol_name = btrfs_get_subvol_name_from_objectid(info, - BTRFS_I(d_inode(dentry))->root->root_key.objectid); + btrfs_root_id(BTRFS_I(d_inode(dentry))->root)); if (!IS_ERR(subvol_name)) { seq_puts(seq, ",subvol="); seq_escape(seq, subvol_name, " \t\n\\"); @@ -1152,7 +1151,7 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, struct super_block *s = root->d_sb; struct btrfs_fs_info *fs_info = btrfs_sb(s); struct inode *root_inode = d_inode(root); - u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid; + u64 root_objectid = btrfs_root_id(BTRFS_I(root_inode)->root); ret = 0; if (!is_subvolume_inode(root_inode)) { @@ -1774,10 +1773,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); /* Mask in the root object ID too, to disambiguate subvols */ - buf->f_fsid.val[0] ^= - BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32; - buf->f_fsid.val[1] ^= - BTRFS_I(d_inode(dentry))->root->root_key.objectid; + buf->f_fsid.val[0] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root) >> 32; + buf->f_fsid.val[1] ^= btrfs_root_id(BTRFS_I(d_inode(dentry))->root); return 0; } @@ -2374,6 +2371,24 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) return 0; } +static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_control *sc) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + const s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps); + + trace_btrfs_extent_map_shrinker_count(fs_info, nr); + + return nr; +} + +static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc) +{ + const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + + return btrfs_free_extent_maps(fs_info, nr_to_scan); +} + static const struct super_operations btrfs_super_ops = { .drop_inode = btrfs_drop_inode, .evict_inode = btrfs_evict_inode, @@ -2387,6 +2402,8 @@ static const struct super_operations btrfs_super_ops = { .statfs = btrfs_statfs, .freeze_fs = btrfs_freeze, .unfreeze_fs = btrfs_unfreeze, + .nr_cached_objects = btrfs_nr_cached_objects, + .free_cached_objects = btrfs_free_cached_objects, }; static const struct file_operations btrfs_ctl_fops = { diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c6387a8ddb94..af545b6b1190 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -2339,7 +2339,7 @@ int btrfs_sysfs_add_one_qgroup(struct btrfs_fs_info *fs_info, struct kobject *qgroups_kobj = fs_info->qgroups_kobj; int ret; - if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + if (btrfs_is_testing(fs_info)) return 0; if (qgroup->kobj.state_initialized) return 0; @@ -2360,7 +2360,7 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info) struct btrfs_qgroup *qgroup; struct btrfs_qgroup *next; - if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + if (btrfs_is_testing(fs_info)) return; rbtree_postorder_for_each_entry_safe(qgroup, next, @@ -2381,7 +2381,7 @@ int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info) struct btrfs_qgroup *next; int ret = 0; - if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + if (btrfs_is_testing(fs_info)) return 0; ASSERT(fsid_kobj); @@ -2413,7 +2413,7 @@ out: void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) { - if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)) + if (btrfs_is_testing(fs_info)) return; if (qgroup->kobj.state_initialized) { diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 709c6cc9706a..dce0387ef155 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -160,8 +160,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) if (!fs_info) return; - if (WARN_ON(!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, - &fs_info->fs_state))) + if (WARN_ON(!btrfs_is_testing(fs_info))) return; test_mnt->mnt_sb->s_fs_info = NULL; diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 47b5d301038e..ba36794ba2d5 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -11,19 +11,22 @@ #include "../disk-io.h" #include "../block-group.h" -static void free_extent_map_tree(struct extent_map_tree *em_tree) +static int free_extent_map_tree(struct btrfs_inode *inode) { + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; struct rb_node *node; + int ret = 0; write_lock(&em_tree->lock); while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) { node = rb_first_cached(&em_tree->map); em = rb_entry(node, struct extent_map, rb_node); - remove_extent_mapping(em_tree, em); + remove_extent_mapping(inode, em); #ifdef CONFIG_BTRFS_DEBUG if (refcount_read(&em->refs) != 1) { + ret = -EINVAL; test_err( "em leak: em (start %llu len %llu block_start %llu block_len %llu) refs %d", em->start, em->len, em->block_start, @@ -35,6 +38,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) free_extent_map(em); } write_unlock(&em_tree->lock); + + return ret; } /* @@ -53,13 +58,14 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) * ->add_extent_mapping(0, 16K) * -> #handle -EEXIST */ -static int test_case_1(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree) +static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; u64 start = 0; u64 len = SZ_8K; int ret; + int ret2; em = alloc_extent_map(); if (!em) { @@ -73,7 +79,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, em->block_start = 0; em->block_len = SZ_16K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 16K)"); @@ -94,7 +100,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, em->block_start = SZ_32K; /* avoid merging */ em->block_len = SZ_4K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [16K, 20K)"); @@ -115,7 +121,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, em->block_start = start; em->block_len = len; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret) { test_err("case1 [%llu %llu]: ret %d", start, start + len, ret); @@ -137,7 +143,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, } free_extent_map(em); out: - free_extent_map_tree(em_tree); + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; return ret; } @@ -148,11 +156,12 @@ out: * Reading the inline ending up with EEXIST, ie. read an inline * extent and discard page cache and read it again. */ -static int test_case_2(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree) +static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; int ret; + int ret2; em = alloc_extent_map(); if (!em) { @@ -166,7 +175,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info, em->block_start = EXTENT_MAP_INLINE; em->block_len = (u64)-1; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 1K)"); @@ -187,7 +196,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info, em->block_start = SZ_4K; em->block_len = SZ_4K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [4K, 8K)"); @@ -208,7 +217,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info, em->block_start = EXTENT_MAP_INLINE; em->block_len = (u64)-1; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret) { test_err("case2 [0 1K]: ret %d", ret); @@ -229,17 +238,21 @@ static int test_case_2(struct btrfs_fs_info *fs_info, } free_extent_map(em); out: - free_extent_map_tree(em_tree); + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; return ret; } static int __test_case_3(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree, u64 start) + struct btrfs_inode *inode, u64 start) { + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; u64 len = SZ_4K; int ret; + int ret2; em = alloc_extent_map(); if (!em) { @@ -253,7 +266,7 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, em->block_start = SZ_4K; em->block_len = SZ_4K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [4K, 8K)"); @@ -274,7 +287,7 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, em->block_start = 0; em->block_len = SZ_16K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + ret = btrfs_add_extent_mapping(inode, &em, start, len); write_unlock(&em_tree->lock); if (ret) { test_err("case3 [%llu %llu): ret %d", @@ -301,7 +314,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, } free_extent_map(em); out: - free_extent_map_tree(em_tree); + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; return ret; } @@ -322,28 +337,29 @@ out: * -> add_extent_mapping() * -> add_extent_mapping() */ -static int test_case_3(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree) +static int test_case_3(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { int ret; - ret = __test_case_3(fs_info, em_tree, 0); + ret = __test_case_3(fs_info, inode, 0); if (ret) return ret; - ret = __test_case_3(fs_info, em_tree, SZ_8K); + ret = __test_case_3(fs_info, inode, SZ_8K); if (ret) return ret; - ret = __test_case_3(fs_info, em_tree, (12 * SZ_1K)); + ret = __test_case_3(fs_info, inode, (12 * SZ_1K)); return ret; } static int __test_case_4(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree, u64 start) + struct btrfs_inode *inode, u64 start) { + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; u64 len = SZ_4K; int ret; + int ret2; em = alloc_extent_map(); if (!em) { @@ -357,7 +373,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, em->block_start = 0; em->block_len = SZ_8K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 8K)"); @@ -378,7 +394,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, em->block_start = SZ_16K; /* avoid merging */ em->block_len = 24 * SZ_1K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [8K, 32K)"); @@ -398,7 +414,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, em->block_start = 0; em->block_len = SZ_32K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + ret = btrfs_add_extent_mapping(inode, &em, start, len); write_unlock(&em_tree->lock); if (ret) { test_err("case4 [%llu %llu): ret %d", @@ -420,7 +436,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, } free_extent_map(em); out: - free_extent_map_tree(em_tree); + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; return ret; } @@ -450,23 +468,22 @@ out: * # handle -EEXIST when adding * # [0, 32K) */ -static int test_case_4(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree) +static int test_case_4(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { int ret; - ret = __test_case_4(fs_info, em_tree, 0); + ret = __test_case_4(fs_info, inode, 0); if (ret) return ret; - ret = __test_case_4(fs_info, em_tree, SZ_4K); + ret = __test_case_4(fs_info, inode, SZ_4K); return ret; } -static int add_compressed_extent(struct btrfs_fs_info *fs_info, - struct extent_map_tree *em_tree, +static int add_compressed_extent(struct btrfs_inode *inode, u64 start, u64 len, u64 block_start) { + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; int ret; @@ -482,7 +499,7 @@ static int add_compressed_extent(struct btrfs_fs_info *fs_info, em->block_len = SZ_4K; em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); free_extent_map(em); if (ret < 0) { @@ -588,53 +605,44 @@ static int validate_range(struct extent_map_tree *em_tree, int index) * They'll have the EXTENT_FLAG_COMPRESSED flag set to keep the em tree from * merging the em's. */ -static int test_case_5(struct btrfs_fs_info *fs_info) +static int test_case_5(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { - struct extent_map_tree *em_tree; - struct inode *inode; u64 start, end; int ret; + int ret2; test_msg("Running btrfs_drop_extent_map_range tests"); - inode = btrfs_new_test_inode(); - if (!inode) { - test_std_err(TEST_ALLOC_INODE); - return -ENOMEM; - } - - em_tree = &BTRFS_I(inode)->extent_tree; - /* [0, 12k) */ - ret = add_compressed_extent(fs_info, em_tree, 0, SZ_4K * 3, 0); + ret = add_compressed_extent(inode, 0, SZ_4K * 3, 0); if (ret) { test_err("cannot add extent range [0, 12K)"); goto out; } /* [12k, 24k) */ - ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K); + ret = add_compressed_extent(inode, SZ_4K * 3, SZ_4K * 3, SZ_4K); if (ret) { test_err("cannot add extent range [12k, 24k)"); goto out; } /* [24k, 36k) */ - ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K); + ret = add_compressed_extent(inode, SZ_4K * 6, SZ_4K * 3, SZ_8K); if (ret) { test_err("cannot add extent range [12k, 24k)"); goto out; } /* [36k, 40k) */ - ret = add_compressed_extent(fs_info, em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3); + ret = add_compressed_extent(inode, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3); if (ret) { test_err("cannot add extent range [12k, 24k)"); goto out; } /* [40k, 64k) */ - ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K); + ret = add_compressed_extent(inode, SZ_4K * 10, SZ_4K * 6, SZ_16K); if (ret) { test_err("cannot add extent range [12k, 24k)"); goto out; @@ -643,36 +651,39 @@ static int test_case_5(struct btrfs_fs_info *fs_info) /* Drop [8k, 12k) */ start = SZ_8K; end = (3 * SZ_4K) - 1; - btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); - ret = validate_range(&BTRFS_I(inode)->extent_tree, 0); + btrfs_drop_extent_map_range(inode, start, end, false); + ret = validate_range(&inode->extent_tree, 0); if (ret) goto out; /* Drop [12k, 20k) */ start = SZ_4K * 3; end = SZ_16K + SZ_4K - 1; - btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); - ret = validate_range(&BTRFS_I(inode)->extent_tree, 1); + btrfs_drop_extent_map_range(inode, start, end, false); + ret = validate_range(&inode->extent_tree, 1); if (ret) goto out; /* Drop [28k, 32k) */ start = SZ_32K - SZ_4K; end = SZ_32K - 1; - btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); - ret = validate_range(&BTRFS_I(inode)->extent_tree, 2); + btrfs_drop_extent_map_range(inode, start, end, false); + ret = validate_range(&inode->extent_tree, 2); if (ret) goto out; /* Drop [32k, 64k) */ start = SZ_32K; end = SZ_64K - 1; - btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); - ret = validate_range(&BTRFS_I(inode)->extent_tree, 3); + btrfs_drop_extent_map_range(inode, start, end, false); + ret = validate_range(&inode->extent_tree, 3); if (ret) goto out; out: - iput(inode); + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; + return ret; } @@ -681,23 +692,26 @@ out: * for areas between two existing ems. Validate it doesn't do this when there * are two unmerged em's side by side. */ -static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree) +static int test_case_6(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em = NULL; int ret; + int ret2; - ret = add_compressed_extent(fs_info, em_tree, 0, SZ_4K, 0); + ret = add_compressed_extent(inode, 0, SZ_4K, 0); if (ret) goto out; - ret = add_compressed_extent(fs_info, em_tree, SZ_4K, SZ_4K, 0); + ret = add_compressed_extent(inode, SZ_4K, SZ_4K, 0); if (ret) goto out; em = alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); - return -ENOMEM; + ret = -ENOMEM; + goto out; } em->start = SZ_4K; @@ -705,7 +719,7 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em em->block_start = SZ_16K; em->block_len = SZ_16K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, 0, SZ_8K); + ret = btrfs_add_extent_mapping(inode, &em, 0, SZ_8K); write_unlock(&em_tree->lock); if (ret != 0) { @@ -725,7 +739,10 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em ret = 0; out: free_extent_map(em); - free_extent_map_tree(em_tree); + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; + return ret; } @@ -734,28 +751,19 @@ out: * true would mess up the start/end calculations and subsequent splits would be * incorrect. */ -static int test_case_7(struct btrfs_fs_info *fs_info) +static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { - struct extent_map_tree *em_tree; + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; - struct inode *inode; int ret; + int ret2; test_msg("Running btrfs_drop_extent_cache with pinned"); - inode = btrfs_new_test_inode(); - if (!inode) { - test_std_err(TEST_ALLOC_INODE); - return -ENOMEM; - } - - em_tree = &BTRFS_I(inode)->extent_tree; - em = alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); - ret = -ENOMEM; - goto out; + return -ENOMEM; } /* [0, 16K), pinned */ @@ -765,7 +773,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info) em->block_len = SZ_4K; em->flags |= EXTENT_FLAG_PINNED; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("couldn't add extent map"); @@ -786,7 +794,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info) em->block_start = SZ_32K; em->block_len = SZ_16K; write_lock(&em_tree->lock); - ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("couldn't add extent map"); @@ -798,7 +806,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info) * Drop [0, 36K) This should skip the [0, 4K) extent and then split the * [32K, 48K) extent. */ - btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (36 * SZ_1K) - 1, true); + btrfs_drop_extent_map_range(inode, 0, (36 * SZ_1K) - 1, true); /* Make sure our extent maps look sane. */ ret = -EINVAL; @@ -865,7 +873,14 @@ static int test_case_7(struct btrfs_fs_info *fs_info) ret = 0; out: free_extent_map(em); - iput(inode); + /* Unpin our extent to prevent warning when removing it below. */ + ret2 = unpin_extent_cache(inode, 0, SZ_16K, 0); + if (ret == 0) + ret = ret2; + ret2 = free_extent_map_tree(inode); + if (ret == 0) + ret = ret2; + return ret; } @@ -959,7 +974,8 @@ out_free: int btrfs_test_extent_map(void) { struct btrfs_fs_info *fs_info = NULL; - struct extent_map_tree *em_tree; + struct inode *inode; + struct btrfs_root *root = NULL; int ret = 0, i; struct rmap_test_vector rmap_tests[] = { { @@ -1008,33 +1024,42 @@ int btrfs_test_extent_map(void) return -ENOMEM; } - em_tree = kzalloc(sizeof(*em_tree), GFP_KERNEL); - if (!em_tree) { + inode = btrfs_new_test_inode(); + if (!inode) { + test_std_err(TEST_ALLOC_INODE); ret = -ENOMEM; goto out; } - extent_map_tree_init(em_tree); + root = btrfs_alloc_dummy_root(fs_info); + if (IS_ERR(root)) { + test_std_err(TEST_ALLOC_ROOT); + ret = PTR_ERR(root); + root = NULL; + goto out; + } - ret = test_case_1(fs_info, em_tree); + BTRFS_I(inode)->root = root; + + ret = test_case_1(fs_info, BTRFS_I(inode)); if (ret) goto out; - ret = test_case_2(fs_info, em_tree); + ret = test_case_2(fs_info, BTRFS_I(inode)); if (ret) goto out; - ret = test_case_3(fs_info, em_tree); + ret = test_case_3(fs_info, BTRFS_I(inode)); if (ret) goto out; - ret = test_case_4(fs_info, em_tree); + ret = test_case_4(fs_info, BTRFS_I(inode)); if (ret) goto out; - ret = test_case_5(fs_info); + ret = test_case_5(fs_info, BTRFS_I(inode)); if (ret) goto out; - ret = test_case_6(fs_info, em_tree); + ret = test_case_6(fs_info, BTRFS_I(inode)); if (ret) goto out; - ret = test_case_7(fs_info); + ret = test_case_7(fs_info, BTRFS_I(inode)); if (ret) goto out; @@ -1046,7 +1071,8 @@ int btrfs_test_extent_map(void) } out: - kfree(em_tree); + iput(inode); + btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); return ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 85f359e0e0a7..3388c836b9a5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -426,7 +426,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, return 0; } radix_tree_tag_set(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, + (unsigned long)btrfs_root_id(root), BTRFS_ROOT_TRANS_TAG); spin_unlock(&fs_info->fs_roots_radix_lock); root->last_trans = trans->transid; @@ -472,7 +472,7 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, /* Make sure we don't try to update the root at commit time */ spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_tag_clear(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, + (unsigned long)btrfs_root_id(root), BTRFS_ROOT_TRANS_TAG); spin_unlock(&fs_info->fs_roots_radix_lock); } @@ -550,7 +550,7 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root) if (!fs_info->reloc_ctl || !test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID || root->reloc_root) return false; @@ -1052,7 +1052,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; - int err = 0; + int ret = 0; if (refcount_read(&trans->use_count) > 1) { refcount_dec(&trans->use_count); @@ -1091,13 +1091,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) { wake_up_process(info->transaction_kthread); if (TRANS_ABORTED(trans)) - err = trans->aborted; + ret = trans->aborted; else - err = -EROFS; + ret = -EROFS; } kmem_cache_free(btrfs_trans_handle_cachep, trans); - return err; + return ret; } int btrfs_end_transaction(struct btrfs_trans_handle *trans) @@ -1118,8 +1118,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans) int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages, int mark) { - int err = 0; - int werr = 0; + int ret = 0; struct address_space *mapping = fs_info->btree_inode->i_mapping; struct extent_state *cached_state = NULL; u64 start = 0; @@ -1129,7 +1128,7 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, mark, &cached_state)) { bool wait_writeback = false; - err = convert_extent_bit(dirty_pages, start, end, + ret = convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark, &cached_state); /* @@ -1145,22 +1144,22 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, * We cleanup any entries left in the io tree when committing * the transaction (through extent_io_tree_release()). */ - if (err == -ENOMEM) { - err = 0; + if (ret == -ENOMEM) { + ret = 0; wait_writeback = true; } - if (!err) - err = filemap_fdatawrite_range(mapping, start, end); - if (err) - werr = err; - else if (wait_writeback) - werr = filemap_fdatawait_range(mapping, start, end); + if (!ret) + ret = filemap_fdatawrite_range(mapping, start, end); + if (!ret && wait_writeback) + ret = filemap_fdatawait_range(mapping, start, end); free_extent_state(cached_state); + if (ret) + break; cached_state = NULL; cond_resched(); start = end + 1; } - return werr; + return ret; } /* @@ -1172,12 +1171,11 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages) { - int err = 0; - int werr = 0; struct address_space *mapping = fs_info->btree_inode->i_mapping; struct extent_state *cached_state = NULL; u64 start = 0; u64 end; + int ret = 0; while (find_first_extent_bit(dirty_pages, start, &start, &end, EXTENT_NEED_WAIT, &cached_state)) { @@ -1189,22 +1187,20 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, * concurrently - we do it only at transaction commit time when * it's safe to do it (through extent_io_tree_release()). */ - err = clear_extent_bit(dirty_pages, start, end, + ret = clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, &cached_state); - if (err == -ENOMEM) - err = 0; - if (!err) - err = filemap_fdatawait_range(mapping, start, end); - if (err) - werr = err; + if (ret == -ENOMEM) + ret = 0; + if (!ret) + ret = filemap_fdatawait_range(mapping, start, end); free_extent_state(cached_state); + if (ret) + break; cached_state = NULL; cond_resched(); start = end + 1; } - if (err) - werr = err; - return werr; + return ret; } static int btrfs_wait_extents(struct btrfs_fs_info *fs_info, @@ -1229,7 +1225,7 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark) bool errors = false; int err; - ASSERT(log_root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID); err = __btrfs_wait_marked_extents(fs_info, dirty_pages); if ((mark & EXTENT_DIRTY) && @@ -1492,7 +1488,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) ASSERT(atomic_read(&root->log_commit[1]) == 0); radix_tree_tag_clear(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, + (unsigned long)btrfs_root_id(root), BTRFS_ROOT_TRANS_TAG); btrfs_qgroup_free_meta_all_pertrans(root); spin_unlock(&fs_info->fs_roots_radix_lock); @@ -1583,8 +1579,8 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, goto out; /* Now qgroup are all updated, we can inherit it to new qgroups */ - ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid, - parent->root_key.objectid, inherit); + ret = btrfs_qgroup_inherit(trans, btrfs_root_id(src), dst_objectid, + btrfs_root_id(parent), inherit); if (ret < 0) goto out; @@ -1822,7 +1818,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * insert root back/forward references */ ret = btrfs_add_root_ref(trans, objectid, - parent_root->root_key.objectid, + btrfs_root_id(parent_root), btrfs_ino(BTRFS_I(parent_inode)), index, &fname.disk_name); if (ret) { @@ -1855,16 +1851,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = qgroup_account_snapshot(trans, root, parent_root, pending->inherit, objectid); else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) - ret = btrfs_qgroup_inherit(trans, root->root_key.objectid, objectid, - parent_root->root_key.objectid, pending->inherit); + ret = btrfs_qgroup_inherit(trans, btrfs_root_id(root), objectid, + btrfs_root_id(parent_root), pending->inherit); if (ret < 0) goto fail; ret = btrfs_insert_dir_item(trans, &fname.disk_name, BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, index); - /* We have check then name at the beginning, so it is impossible. */ - BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -2625,7 +2619,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) list_del_init(&root->root_list); spin_unlock(&fs_info->trans_lock); - btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid); + btrfs_debug(fs_info, "cleaner removing %llu", btrfs_root_id(root)); btrfs_kill_all_delayed_nodes(root); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 32604e9b31c3..a2c3651a3d8f 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -2021,7 +2021,7 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) * Skip dummy fs, as selftests don't create unique ebs for each dummy * root. */ - if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &eb->fs_info->fs_state)) + if (btrfs_is_testing(eb->fs_info)) return 0; /* * There are several call sites (backref walking, qgroup, and data diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 472918a5bc73..5146387b416b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -391,7 +391,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans, * the leaf before writing into the log tree. See the comments at * copy_items() for more details. */ - ASSERT(root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); + ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID); item_size = btrfs_item_size(eb, slot); src_ptr = btrfs_item_ptr_offset(eb, slot); @@ -748,7 +748,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, goto out; if (ins.objectid > 0) { - struct btrfs_ref ref = { 0 }; u64 csum_start; u64 csum_end; LIST_HEAD(ordered_sums); @@ -762,13 +761,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (ret < 0) { goto out; } else if (ret == 0) { - btrfs_init_generic_ref(&ref, - BTRFS_ADD_DELAYED_REF, - ins.objectid, ins.offset, 0, - root->root_key.objectid); - btrfs_init_data_ref(&ref, - root->root_key.objectid, - key->objectid, offset, 0, false); + struct btrfs_ref ref = { + .action = BTRFS_ADD_DELAYED_REF, + .bytenr = ins.objectid, + .num_bytes = ins.offset, + .owning_root = btrfs_root_id(root), + .ref_root = btrfs_root_id(root), + }; + btrfs_init_data_ref(&ref, key->objectid, offset, + 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) goto out; @@ -778,7 +779,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * allocation tree */ ret = btrfs_alloc_logged_file_extent(trans, - root->root_key.objectid, + btrfs_root_id(root), key->objectid, offset, &ins); if (ret) goto out; @@ -797,9 +798,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1, - &ordered_sums, 0, false); - if (ret) + &ordered_sums, false); + if (ret < 0) goto out; + ret = 0; /* * Now delete all existing cums in the csum root that * cover our range. We do this because we can have an @@ -3045,7 +3047,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (ret != -ENOSPC) btrfs_err(fs_info, "failed to update log for root %llu ret %d", - root->root_key.objectid, ret); + btrfs_root_id(root), ret); btrfs_wait_tree_log_extents(log, mark); mutex_unlock(&log_root_tree->log_mutex); goto out; @@ -4460,9 +4462,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, disk_bytenr += extent_offset; ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, disk_bytenr + extent_num_bytes - 1, - &ordered_sums, 0, false); - if (ret) + &ordered_sums, false); + if (ret < 0) goto out; + ret = 0; list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) { if (!ret) @@ -4574,8 +4577,8 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, struct btrfs_root *csum_root; u64 csum_offset; u64 csum_len; - u64 mod_start = em->mod_start; - u64 mod_len = em->mod_len; + u64 mod_start = em->start; + u64 mod_len = em->len; LIST_HEAD(ordered_sums); int ret = 0; @@ -4655,9 +4658,10 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, csum_root = btrfs_csum_root(trans->fs_info, em->block_start); ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset, em->block_start + csum_offset + - csum_len - 1, &ordered_sums, 0, false); - if (ret) + csum_len - 1, &ordered_sums, false); + if (ret < 0) return ret; + ret = 0; while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, @@ -4945,7 +4949,7 @@ process: * private list. */ if (ret) { - clear_em_logging(tree, em); + clear_em_logging(inode, em); free_extent_map(em); continue; } @@ -4954,7 +4958,7 @@ process: ret = log_one_extent(trans, inode, em, path, ctx); write_lock(&tree->lock); - clear_em_logging(tree, em); + clear_em_logging(inode, em); free_extent_map(em); } WARN_ON(!list_empty(&extents)); diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 43b3accbed7a..fa45b5fb9683 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -1004,7 +1004,7 @@ struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq) free_extent_buffer(eb_root); check.level = level; - check.owner_root = root->root_key.objectid; + check.owner_root = btrfs_root_id(root); old = read_tree_block(fs_info, logical, &check); if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ef6bd2f4251b..b6a701011fb0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5615,21 +5615,6 @@ struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp) return map; } -struct btrfs_chunk_map *btrfs_clone_chunk_map(struct btrfs_chunk_map *map, gfp_t gfp) -{ - const int size = btrfs_chunk_map_size(map->num_stripes); - struct btrfs_chunk_map *clone; - - clone = kmemdup(map, size, gfp); - if (!clone) - return NULL; - - refcount_set(&clone->refs, 1); - RB_CLEAR_NODE(&clone->rb_node); - - return clone; -} - static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, struct alloc_chunk_ctl *ctl, struct btrfs_device_info *devices_info) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 93854609a4d5..66e6fc481ecd 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -92,6 +92,9 @@ enum btrfs_raid_types { #define BTRFS_DEV_STATE_FLUSH_SENT (4) #define BTRFS_DEV_STATE_NO_READA (5) +/* Special value encoding failure to write primary super block. */ +#define BTRFS_SUPER_PRIMARY_WRITE_ERROR (INT_MAX / 2) + struct btrfs_fs_devices; struct btrfs_device { @@ -142,6 +145,12 @@ struct btrfs_device { /* type and info about this device */ u64 type; + /* + * Counter of super block write errors, values larger than + * BTRFS_SUPER_PRIMARY_WRITE_ERROR encode primary super block write failure. + */ + atomic_t sb_write_errors; + /* minimal io size for this device */ u32 sector_size; @@ -743,7 +752,6 @@ struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp); int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map); #endif -struct btrfs_chunk_map *btrfs_clone_chunk_map(struct btrfs_chunk_map *map, gfp_t gfp); struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info, u64 logical, u64 length); struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 6287763fdccc..15d0999e340e 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -504,7 +504,7 @@ static int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr; unsigned int nofs_flag; char *name; - int err = 0; + int ret = 0; /* * We're holding a transaction handle, so use a NOFS memory allocation @@ -515,7 +515,7 @@ static int btrfs_initxattrs(struct inode *inode, name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(xattr->name) + 1, GFP_KERNEL); if (!name) { - err = -ENOMEM; + ret = -ENOMEM; break; } strcpy(name, XATTR_SECURITY_PREFIX); @@ -524,14 +524,14 @@ static int btrfs_initxattrs(struct inode *inode, if (strcmp(name, XATTR_NAME_CAPS) == 0) clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); - err = btrfs_setxattr(trans, inode, name, xattr->value, + ret = btrfs_setxattr(trans, inode, name, xattr->value, xattr->value_len, 0); kfree(name); - if (err < 0) + if (ret < 0) break; } memalloc_nofs_restore(nofs_flag); - return err; + return ret; } int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index e5b3f2003896..d9e5c88a0f85 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -91,24 +91,24 @@ fail: return ERR_PTR(-ENOMEM); } -int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out) +int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, + unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret; char *data_in = NULL; - char *cpage_out; - int nr_pages = 0; - struct page *in_page = NULL; - struct page *out_page = NULL; + char *cfolio_out; + int nr_folios = 0; + struct folio *in_folio = NULL; + struct folio *out_folio = NULL; unsigned long bytes_left; - unsigned int in_buf_pages; + unsigned int in_buf_folios; unsigned long len = *total_out; - unsigned long nr_dest_pages = *out_pages; - const unsigned long max_out = nr_dest_pages * PAGE_SIZE; + unsigned long nr_dest_folios = *out_folios; + const unsigned long max_out = nr_dest_folios * PAGE_SIZE; - *out_pages = 0; + *out_folios = 0; *total_out = 0; *total_in = 0; @@ -121,18 +121,18 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, workspace->strm.total_in = 0; workspace->strm.total_out = 0; - out_page = btrfs_alloc_compr_page(); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); - pages[0] = out_page; - nr_pages = 1; + cfolio_out = folio_address(out_folio); + folios[0] = out_folio; + nr_folios = 1; workspace->strm.next_in = workspace->buf; workspace->strm.avail_in = 0; - workspace->strm.next_out = cpage_out; + workspace->strm.next_out = cfolio_out; workspace->strm.avail_out = PAGE_SIZE; while (workspace->strm.total_in < len) { @@ -142,19 +142,22 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, */ if (workspace->strm.avail_in == 0) { bytes_left = len - workspace->strm.total_in; - in_buf_pages = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE), - workspace->buf_size / PAGE_SIZE); - if (in_buf_pages > 1) { + in_buf_folios = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE), + workspace->buf_size / PAGE_SIZE); + if (in_buf_folios > 1) { int i; - for (i = 0; i < in_buf_pages; i++) { + for (i = 0; i < in_buf_folios; i++) { if (data_in) { kunmap_local(data_in); - put_page(in_page); + folio_put(in_folio); + data_in = NULL; } - in_page = find_get_page(mapping, - start >> PAGE_SHIFT); - data_in = kmap_local_page(in_page); + ret = btrfs_compress_filemap_get_folio(mapping, + start, &in_folio); + if (ret < 0) + goto out; + data_in = kmap_local_folio(in_folio, 0); copy_page(workspace->buf + i * PAGE_SIZE, data_in); start += PAGE_SIZE; @@ -163,11 +166,14 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, } else { if (data_in) { kunmap_local(data_in); - put_page(in_page); + folio_put(in_folio); + data_in = NULL; } - in_page = find_get_page(mapping, - start >> PAGE_SHIFT); - data_in = kmap_local_page(in_page); + ret = btrfs_compress_filemap_get_folio(mapping, + start, &in_folio); + if (ret < 0) + goto out; + data_in = kmap_local_folio(in_folio, 0); start += PAGE_SIZE; workspace->strm.next_in = data_in; } @@ -196,20 +202,20 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, * the stream end if required */ if (workspace->strm.avail_out == 0) { - if (nr_pages == nr_dest_pages) { + if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_page = btrfs_alloc_compr_page(); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); - pages[nr_pages] = out_page; - nr_pages++; + cfolio_out = folio_address(out_folio); + folios[nr_folios] = out_folio; + nr_folios++; workspace->strm.avail_out = PAGE_SIZE; - workspace->strm.next_out = cpage_out; + workspace->strm.next_out = cfolio_out; } /* we're all done */ if (workspace->strm.total_in >= len) @@ -231,21 +237,21 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -EIO; goto out; } else if (workspace->strm.avail_out == 0) { - /* get another page for the stream end */ - if (nr_pages == nr_dest_pages) { + /* Get another folio for the stream end. */ + if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_page = btrfs_alloc_compr_page(); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); - pages[nr_pages] = out_page; - nr_pages++; + cfolio_out = folio_address(out_folio); + folios[nr_folios] = out_folio; + nr_folios++; workspace->strm.avail_out = PAGE_SIZE; - workspace->strm.next_out = cpage_out; + workspace->strm.next_out = cfolio_out; } } zlib_deflateEnd(&workspace->strm); @@ -259,10 +265,10 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, *total_out = workspace->strm.total_out; *total_in = workspace->strm.total_in; out: - *out_pages = nr_pages; + *out_folios = nr_folios; if (data_in) { kunmap_local(data_in); - put_page(in_page); + folio_put(in_folio); } return ret; @@ -275,13 +281,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) int wbits = MAX_WBITS; char *data_in; size_t total_out = 0; - unsigned long page_in_index = 0; + unsigned long folio_in_index = 0; size_t srclen = cb->compressed_len; - unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; - struct page **pages_in = cb->compressed_pages; + struct folio **folios_in = cb->compressed_folios; - data_in = kmap_local_page(pages_in[page_in_index]); + data_in = kmap_local_folio(folios_in[folio_in_index], 0); workspace->strm.next_in = data_in; workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); workspace->strm.total_in = 0; @@ -331,12 +337,12 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->strm.avail_in == 0) { unsigned long tmp; kunmap_local(data_in); - page_in_index++; - if (page_in_index >= total_pages_in) { + folio_in_index++; + if (folio_in_index >= total_folios_in) { data_in = NULL; break; } - data_in = kmap_local_page(pages_in[page_in_index]); + data_in = kmap_local_folio(folios_in[folio_in_index], 0); workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; workspace->strm.avail_in = min(tmp, PAGE_SIZE); diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 92b3744b819b..2b232b82c3a8 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -374,25 +374,25 @@ fail: return ERR_PTR(-ENOMEM); } -int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out) +int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, + u64 start, struct folio **folios, unsigned long *out_folios, + unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); zstd_cstream *stream; int ret = 0; - int nr_pages = 0; - struct page *in_page = NULL; /* The current page to read */ - struct page *out_page = NULL; /* The current page to write to */ + int nr_folios = 0; + struct folio *in_folio = NULL; /* The current folio to read. */ + struct folio *out_folio = NULL; /* The current folio to write to. */ unsigned long tot_in = 0; unsigned long tot_out = 0; unsigned long len = *total_out; - const unsigned long nr_dest_pages = *out_pages; - unsigned long max_out = nr_dest_pages * PAGE_SIZE; + const unsigned long nr_dest_folios = *out_folios; + unsigned long max_out = nr_dest_folios * PAGE_SIZE; zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level, len); - *out_pages = 0; + *out_folios = 0; *total_out = 0; *total_in = 0; @@ -406,19 +406,21 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, } /* map in the first page of input data */ - in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = kmap_local_page(in_page); + ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); + if (ret < 0) + goto out; + workspace->in_buf.src = kmap_local_folio(in_folio, 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); /* Allocate and map in the output buffer */ - out_page = btrfs_alloc_compr_page(); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + folios[nr_folios++] = out_folio; + workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -453,17 +455,17 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, if (workspace->out_buf.pos == workspace->out_buf.size) { tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; - if (nr_pages == nr_dest_pages) { + if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_page = btrfs_alloc_compr_page(); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + folios[nr_folios++] = out_folio; + workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -479,11 +481,14 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, if (workspace->in_buf.pos == workspace->in_buf.size) { tot_in += PAGE_SIZE; kunmap_local(workspace->in_buf.src); - put_page(in_page); + workspace->in_buf.src = NULL; + folio_put(in_folio); start += PAGE_SIZE; len -= PAGE_SIZE; - in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = kmap_local_page(in_page); + ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); + if (ret < 0) + goto out; + workspace->in_buf.src = kmap_local_folio(in_folio, 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); } @@ -510,17 +515,17 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; - if (nr_pages == nr_dest_pages) { + if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } - out_page = btrfs_alloc_compr_page(); - if (out_page == NULL) { + out_folio = btrfs_alloc_compr_folio(); + if (out_folio == NULL) { ret = -ENOMEM; goto out; } - pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + folios[nr_folios++] = out_folio; + workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); } @@ -534,10 +539,10 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, *total_in = tot_in; *total_out = tot_out; out: - *out_pages = nr_pages; + *out_folios = nr_folios; if (workspace->in_buf.src) { kunmap_local(workspace->in_buf.src); - put_page(in_page); + folio_put(in_folio); } return ret; } @@ -545,12 +550,12 @@ out: int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); - struct page **pages_in = cb->compressed_pages; + struct folio **folios_in = cb->compressed_folios; size_t srclen = cb->compressed_len; zstd_dstream *stream; int ret = 0; - unsigned long page_in_index = 0; - unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long folio_in_index = 0; + unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; unsigned long total_out = 0; @@ -562,7 +567,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } - workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]); + workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); @@ -599,14 +604,15 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->in_buf.pos == workspace->in_buf.size) { kunmap_local(workspace->in_buf.src); - page_in_index++; - if (page_in_index >= total_pages_in) { + folio_in_index++; + if (folio_in_index >= total_folios_in) { workspace->in_buf.src = NULL; ret = -EIO; goto done; } srclen -= PAGE_SIZE; - workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]); + workspace->in_buf.src = + kmap_local_folio(folios_in[folio_in_index], 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); } diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 90b0222390e5..d2d94d7c3fb5 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -16,8 +16,6 @@ struct extent_map; struct btrfs_file_extent_item; struct btrfs_ordered_extent; struct btrfs_delayed_ref_node; -struct btrfs_delayed_tree_ref; -struct btrfs_delayed_data_ref; struct btrfs_delayed_ref_head; struct btrfs_block_group; struct btrfs_free_cluster; @@ -277,8 +275,7 @@ DEFINE_EVENT(btrfs__inode, btrfs_inode_evict, { EXTENT_FLAG_COMPRESS_LZO, "COMPRESS_LZO" },\ { EXTENT_FLAG_COMPRESS_ZSTD, "COMPRESS_ZSTD" },\ { EXTENT_FLAG_PREALLOC, "PREALLOC" },\ - { EXTENT_FLAG_LOGGING, "LOGGING" },\ - { EXTENT_FLAG_FILLING, "FILLING" }) + { EXTENT_FLAG_LOGGING, "LOGGING" }) TRACE_EVENT_CONDITION(btrfs_get_extent, @@ -869,11 +866,9 @@ TRACE_EVENT(btrfs_add_block_group, DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct btrfs_delayed_ref_node *ref, - const struct btrfs_delayed_tree_ref *full_ref, - int action), + const struct btrfs_delayed_ref_node *ref), - TP_ARGS(fs_info, ref, full_ref, action), + TP_ARGS(fs_info, ref), TP_STRUCT__entry_btrfs( __field( u64, bytenr ) @@ -889,10 +884,10 @@ DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref, TP_fast_assign_btrfs(fs_info, __entry->bytenr = ref->bytenr; __entry->num_bytes = ref->num_bytes; - __entry->action = action; - __entry->parent = full_ref->parent; - __entry->ref_root = full_ref->root; - __entry->level = full_ref->level; + __entry->action = ref->action; + __entry->parent = ref->parent; + __entry->ref_root = ref->ref_root; + __entry->level = ref->tree_ref.level; __entry->type = ref->type; __entry->seq = ref->seq; ), @@ -912,31 +907,25 @@ DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref, DEFINE_EVENT(btrfs_delayed_tree_ref, add_delayed_tree_ref, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct btrfs_delayed_ref_node *ref, - const struct btrfs_delayed_tree_ref *full_ref, - int action), + const struct btrfs_delayed_ref_node *ref), - TP_ARGS(fs_info, ref, full_ref, action) + TP_ARGS(fs_info, ref) ); DEFINE_EVENT(btrfs_delayed_tree_ref, run_delayed_tree_ref, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct btrfs_delayed_ref_node *ref, - const struct btrfs_delayed_tree_ref *full_ref, - int action), + const struct btrfs_delayed_ref_node *ref), - TP_ARGS(fs_info, ref, full_ref, action) + TP_ARGS(fs_info, ref) ); DECLARE_EVENT_CLASS(btrfs_delayed_data_ref, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct btrfs_delayed_ref_node *ref, - const struct btrfs_delayed_data_ref *full_ref, - int action), + const struct btrfs_delayed_ref_node *ref), - TP_ARGS(fs_info, ref, full_ref, action), + TP_ARGS(fs_info, ref), TP_STRUCT__entry_btrfs( __field( u64, bytenr ) @@ -953,11 +942,11 @@ DECLARE_EVENT_CLASS(btrfs_delayed_data_ref, TP_fast_assign_btrfs(fs_info, __entry->bytenr = ref->bytenr; __entry->num_bytes = ref->num_bytes; - __entry->action = action; - __entry->parent = full_ref->parent; - __entry->ref_root = full_ref->root; - __entry->owner = full_ref->objectid; - __entry->offset = full_ref->offset; + __entry->action = ref->action; + __entry->parent = ref->parent; + __entry->ref_root = ref->ref_root; + __entry->owner = ref->data_ref.objectid; + __entry->offset = ref->data_ref.offset; __entry->type = ref->type; __entry->seq = ref->seq; ), @@ -979,21 +968,17 @@ DECLARE_EVENT_CLASS(btrfs_delayed_data_ref, DEFINE_EVENT(btrfs_delayed_data_ref, add_delayed_data_ref, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct btrfs_delayed_ref_node *ref, - const struct btrfs_delayed_data_ref *full_ref, - int action), + const struct btrfs_delayed_ref_node *ref), - TP_ARGS(fs_info, ref, full_ref, action) + TP_ARGS(fs_info, ref) ); DEFINE_EVENT(btrfs_delayed_data_ref, run_delayed_data_ref, TP_PROTO(const struct btrfs_fs_info *fs_info, - const struct btrfs_delayed_ref_node *ref, - const struct btrfs_delayed_data_ref *full_ref, - int action), + const struct btrfs_delayed_ref_node *ref), - TP_ARGS(fs_info, ref, full_ref, action) + TP_ARGS(fs_info, ref) ); DECLARE_EVENT_CLASS(btrfs_delayed_ref_head, @@ -2552,6 +2537,105 @@ TRACE_EVENT(btrfs_get_raid_extent_offset, __entry->devid) ); +TRACE_EVENT(btrfs_extent_map_shrinker_count, + + TP_PROTO(const struct btrfs_fs_info *fs_info, long nr), + + TP_ARGS(fs_info, nr), + + TP_STRUCT__entry_btrfs( + __field( long, nr ) + ), + + TP_fast_assign_btrfs(fs_info, + __entry->nr = nr; + ), + + TP_printk_btrfs("nr=%ld", __entry->nr) +); + +TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter, + + TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr), + + TP_ARGS(fs_info, nr_to_scan, nr), + + TP_STRUCT__entry_btrfs( + __field( long, nr_to_scan ) + __field( long, nr ) + __field( u64, last_root_id ) + __field( u64, last_ino ) + ), + + TP_fast_assign_btrfs(fs_info, + __entry->nr_to_scan = nr_to_scan; + __entry->nr = nr; + __entry->last_root_id = fs_info->extent_map_shrinker_last_root; + __entry->last_ino = fs_info->extent_map_shrinker_last_ino; + ), + + TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", + __entry->nr_to_scan, __entry->nr, + show_root_type(__entry->last_root_id), __entry->last_ino) +); + +TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit, + + TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr), + + TP_ARGS(fs_info, nr_dropped, nr), + + TP_STRUCT__entry_btrfs( + __field( long, nr_dropped ) + __field( long, nr ) + __field( u64, last_root_id ) + __field( u64, last_ino ) + ), + + TP_fast_assign_btrfs(fs_info, + __entry->nr_dropped = nr_dropped; + __entry->nr = nr; + __entry->last_root_id = fs_info->extent_map_shrinker_last_root; + __entry->last_ino = fs_info->extent_map_shrinker_last_ino; + ), + + TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu", + __entry->nr_dropped, __entry->nr, + show_root_type(__entry->last_root_id), __entry->last_ino) +); + +TRACE_EVENT(btrfs_extent_map_shrinker_remove_em, + + TP_PROTO(const struct btrfs_inode *inode, const struct extent_map *em), + + TP_ARGS(inode, em), + + TP_STRUCT__entry_btrfs( + __field( u64, ino ) + __field( u64, root_id ) + __field( u64, start ) + __field( u64, len ) + __field( u64, block_start ) + __field( u32, flags ) + ), + + TP_fast_assign_btrfs(inode->root->fs_info, + __entry->ino = btrfs_ino(inode); + __entry->root_id = inode->root->root_key.objectid; + __entry->start = em->start; + __entry->len = em->len; + __entry->block_start = em->block_start; + __entry->flags = em->flags; + ), + + TP_printk_btrfs( +"ino=%llu root=%llu(%s) start=%llu len=%llu block_start=%llu(%s) flags=%s", + __entry->ino, show_root_type(__entry->root_id), + __entry->start, __entry->len, + show_map_type(__entry->block_start), + show_map_flags(__entry->flags)) +); + #endif /* _TRACE_BTRFS_H */ /* This part must be outside protection */ |