diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-06-02 19:59:25 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-06-02 19:59:25 -0700 |
commit | f3cdc8ae116e27d84e1f33c7a2995960cebb73ac (patch) | |
tree | db3dbbbbf82b76590f601b5caee5de3bef151c4b | |
parent | 8eeae5bae1239c030ba0b34cac97ebd5e7ec1886 (diff) | |
parent | 2166e5edce9ac1edf3b113d6091ef72fcac2d6c4 (diff) |
Merge tag 'for-5.8-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
"Highlights:
- speedup dead root detection during orphan cleanup, eg. when there
are many deleted subvolumes waiting to be cleaned, the trees are
now looked up in radix tree instead of a O(N^2) search
- snapshot creation with inherited qgroup will mark the qgroup
inconsistent, requires a rescan
- send will emit file capabilities after chown, this produces a
stream that does not need postprocessing to set the capabilities
again
- direct io ported to iomap infrastructure, cleaned up and simplified
code, notably removing last use of struct buffer_head in btrfs code
Core changes:
- factor out backreference iteration, to be used by ordinary
backreferences and relocation code
- improved global block reserve utilization
* better logic to serialize requests
* increased maximum available for unlink
* improved handling on large pages (64K)
- direct io cleanups and fixes
* simplify layering, where cloned bios were unnecessarily created
for some cases
* error handling fixes (submit, endio)
* remove repair worker thread, used to avoid deadlocks during
repair
- refactored block group reading code, preparatory work for new type
of block group storage that should improve mount time on large
filesystems
Cleanups:
- cleaned up (and slightly sped up) set/get helpers for metadata data
structure members
- root bit REF_COWS got renamed to SHAREABLE to reflect the that the
blocks of the tree get shared either among subvolumes or with the
relocation trees
Fixes:
- when subvolume deletion fails due to ENOSPC, the filesystem is not
turned read-only
- device scan deals with devices from other filesystems that changed
ownership due to overwrite (mkfs)
- fix a race between scrub and block group removal/allocation
- fix long standing bug of a runaway balance operation, printing the
same line to the syslog, caused by a stale status bit on a reloc
tree that prevented progress
- fix corrupt log due to concurrent fsync of inodes with shared
extents
- fix space underflow for NODATACOW and buffered writes when it for
some reason needs to fallback to COW mode"
* tag 'for-5.8-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (133 commits)
btrfs: fix space_info bytes_may_use underflow during space cache writeout
btrfs: fix space_info bytes_may_use underflow after nocow buffered write
btrfs: fix wrong file range cleanup after an error filling dealloc range
btrfs: remove redundant local variable in read_block_for_search
btrfs: open code key_search
btrfs: split btrfs_direct_IO to read and write part
btrfs: remove BTRFS_INODE_READDIO_NEED_LOCK
fs: remove dio_end_io()
btrfs: switch to iomap_dio_rw() for dio
iomap: remove lockdep_assert_held()
iomap: add a filesystem hook for direct I/O bio submission
fs: export generic_file_buffered_read()
btrfs: turn space cache writeout failure messages into debug messages
btrfs: include error on messages about failure to write space/inode caches
btrfs: remove useless 'fail_unlock' label from btrfs_csum_file_blocks()
btrfs: do not ignore error from btrfs_next_leaf() when inserting checksums
btrfs: make checksum item extension more efficient
btrfs: fix corrupt log due to concurrent fsync of inodes with shared extents
btrfs: unexport btrfs_compress_set_level()
btrfs: simplify iget helpers
...
52 files changed, 3224 insertions, 3045 deletions
diff --git a/.clang-format b/.clang-format index e92e6dd1780d..a0a96088c74f 100644 --- a/.clang-format +++ b/.clang-format @@ -80,6 +80,7 @@ ForEachMacros: - 'ax25_uid_for_each' - '__bio_for_each_bvec' - 'bio_for_each_bvec' + - 'bio_for_each_bvec_all' - 'bio_for_each_integrity_vec' - '__bio_for_each_segment' - 'bio_for_each_segment' diff --git a/Documentation/block/biovecs.rst b/Documentation/block/biovecs.rst index ad303a2569d3..36771a131b56 100644 --- a/Documentation/block/biovecs.rst +++ b/Documentation/block/biovecs.rst @@ -129,6 +129,7 @@ Usage of helpers: :: bio_for_each_segment_all() + bio_for_each_bvec_all() bio_first_bvec_all() bio_first_page_all() bio_last_bvec_all() @@ -143,4 +144,5 @@ Usage of helpers: bio_vec' will contain a multi-page IO vector during the iteration:: bio_for_each_bvec() + bio_for_each_bvec_all() rq_for_each_bvec() diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 575636f6491e..68b95ad82126 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -14,6 +14,7 @@ config BTRFS_FS select LZO_DECOMPRESS select ZSTD_COMPRESS select ZSTD_DECOMPRESS + select FS_IOMAP select RAID6_PQ select XOR_BLOCKS select SRCU diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 0cc02577577b..d888e71e66b6 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -13,6 +13,7 @@ #include "transaction.h" #include "delayed-ref.h" #include "locking.h" +#include "misc.h" /* Just an arbitrary number so we can be sure this happened */ #define BACKREF_FOUND_SHARED 6 @@ -537,18 +538,13 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, const u64 *extent_item_pos, bool ignore_offset) { struct btrfs_root *root; - struct btrfs_key root_key; struct extent_buffer *eb; int ret = 0; int root_level; int level = ref->level; struct btrfs_key search_key = ref->key_for_search; - root_key.objectid = ref->root_id; - root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = (u64)-1; - - root = btrfs_get_fs_root(fs_info, &root_key, false); + root = btrfs_get_fs_root(fs_info, ref->root_id, false); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out_free; @@ -2295,3 +2291,832 @@ void free_ipath(struct inode_fs_paths *ipath) kvfree(ipath->fspath); kfree(ipath); } + +struct btrfs_backref_iter *btrfs_backref_iter_alloc( + struct btrfs_fs_info *fs_info, gfp_t gfp_flag) +{ + struct btrfs_backref_iter *ret; + + ret = kzalloc(sizeof(*ret), gfp_flag); + if (!ret) + return NULL; + + ret->path = btrfs_alloc_path(); + if (!ret) { + kfree(ret); + return NULL; + } + + /* Current backref iterator only supports iteration in commit root */ + ret->path->search_commit_root = 1; + ret->path->skip_locking = 1; + ret->fs_info = fs_info; + + return ret; +} + +int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) +{ + struct btrfs_fs_info *fs_info = iter->fs_info; + struct btrfs_path *path = iter->path; + struct btrfs_extent_item *ei; + struct btrfs_key key; + int ret; + + key.objectid = bytenr; + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = (u64)-1; + iter->bytenr = bytenr; + + ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + if (ret < 0) + return ret; + if (ret == 0) { + ret = -EUCLEAN; + goto release; + } + if (path->slots[0] == 0) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + ret = -EUCLEAN; + goto release; + } + path->slots[0]--; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if ((key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) || key.objectid != bytenr) { + ret = -ENOENT; + goto release; + } + memcpy(&iter->cur_key, &key, sizeof(key)); + iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + iter->end_ptr = (u32)(iter->item_ptr + + btrfs_item_size_nr(path->nodes[0], path->slots[0])); + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + + /* + * Only support iteration on tree backref yet. + * + * This is an extra precaution for non skinny-metadata, where + * EXTENT_ITEM is also used for tree blocks, that we can only use + * extent flags to determine if it's a tree block. + */ + if (btrfs_extent_flags(path->nodes[0], ei) & BTRFS_EXTENT_FLAG_DATA) { + ret = -ENOTSUPP; + goto release; + } + iter->cur_ptr = (u32)(iter->item_ptr + sizeof(*ei)); + + /* If there is no inline backref, go search for keyed backref */ + if (iter->cur_ptr >= iter->end_ptr) { + ret = btrfs_next_item(fs_info->extent_root, path); + + /* No inline nor keyed ref */ + if (ret > 0) { + ret = -ENOENT; + goto release; + } + if (ret < 0) + goto release; + + btrfs_item_key_to_cpu(path->nodes[0], &iter->cur_key, + path->slots[0]); + if (iter->cur_key.objectid != bytenr || + (iter->cur_key.type != BTRFS_SHARED_BLOCK_REF_KEY && + iter->cur_key.type != BTRFS_TREE_BLOCK_REF_KEY)) { + ret = -ENOENT; + goto release; + } + iter->cur_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + iter->item_ptr = iter->cur_ptr; + iter->end_ptr = (u32)(iter->item_ptr + btrfs_item_size_nr( + path->nodes[0], path->slots[0])); + } + + return 0; +release: + btrfs_backref_iter_release(iter); + return ret; +} + +/* + * Go to the next backref item of current bytenr, can be either inlined or + * keyed. + * + * Caller needs to check whether it's inline ref or not by iter->cur_key. + * + * Return 0 if we get next backref without problem. + * Return >0 if there is no extra backref for this bytenr. + * Return <0 if there is something wrong happened. + */ +int btrfs_backref_iter_next(struct btrfs_backref_iter *iter) +{ + struct extent_buffer *eb = btrfs_backref_get_eb(iter); + struct btrfs_path *path = iter->path; + struct btrfs_extent_inline_ref *iref; + int ret; + u32 size; + + if (btrfs_backref_iter_is_inline_ref(iter)) { + /* We're still inside the inline refs */ + ASSERT(iter->cur_ptr < iter->end_ptr); + + if (btrfs_backref_has_tree_block_info(iter)) { + /* First tree block info */ + size = sizeof(struct btrfs_tree_block_info); + } else { + /* Use inline ref type to determine the size */ + int type; + + iref = (struct btrfs_extent_inline_ref *) + ((unsigned long)iter->cur_ptr); + type = btrfs_extent_inline_ref_type(eb, iref); + + size = btrfs_extent_inline_ref_size(type); + } + iter->cur_ptr += size; + if (iter->cur_ptr < iter->end_ptr) + return 0; + + /* All inline items iterated, fall through */ + } + + /* We're at keyed items, there is no inline item, go to the next one */ + ret = btrfs_next_item(iter->fs_info->extent_root, iter->path); + if (ret) + return ret; + + btrfs_item_key_to_cpu(path->nodes[0], &iter->cur_key, path->slots[0]); + if (iter->cur_key.objectid != iter->bytenr || + (iter->cur_key.type != BTRFS_TREE_BLOCK_REF_KEY && + iter->cur_key.type != BTRFS_SHARED_BLOCK_REF_KEY)) + return 1; + iter->item_ptr = (u32)btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + iter->cur_ptr = iter->item_ptr; + iter->end_ptr = iter->item_ptr + (u32)btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + return 0; +} + +void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info, + struct btrfs_backref_cache *cache, int is_reloc) +{ + int i; + + cache->rb_root = RB_ROOT; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + INIT_LIST_HEAD(&cache->pending[i]); + INIT_LIST_HEAD(&cache->changed); + INIT_LIST_HEAD(&cache->detached); + INIT_LIST_HEAD(&cache->leaves); + INIT_LIST_HEAD(&cache->pending_edge); + INIT_LIST_HEAD(&cache->useless_node); + cache->fs_info = fs_info; + cache->is_reloc = is_reloc; +} + +struct btrfs_backref_node *btrfs_backref_alloc_node( + struct btrfs_backref_cache *cache, u64 bytenr, int level) +{ + struct btrfs_backref_node *node; + + ASSERT(level >= 0 && level < BTRFS_MAX_LEVEL); + node = kzalloc(sizeof(*node), GFP_NOFS); + if (!node) + return node; + + INIT_LIST_HEAD(&node->list); + INIT_LIST_HEAD(&node->upper); + INIT_LIST_HEAD(&node->lower); + RB_CLEAR_NODE(&node->rb_node); + cache->nr_nodes++; + node->level = level; + node->bytenr = bytenr; + + return node; +} + +struct btrfs_backref_edge *btrfs_backref_alloc_edge( + struct btrfs_backref_cache *cache) +{ + struct btrfs_backref_edge *edge; + + edge = kzalloc(sizeof(*edge), GFP_NOFS); + if (edge) + cache->nr_edges++; + return edge; +} + +/* + * Drop the backref node from cache, also cleaning up all its + * upper edges and any uncached nodes in the path. + * + * This cleanup happens bottom up, thus the node should either + * be the lowest node in the cache or a detached node. + */ +void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *node) +{ + struct btrfs_backref_node *upper; + struct btrfs_backref_edge *edge; + + if (!node) + return; + + BUG_ON(!node->lowest && !node->detached); + while (!list_empty(&node->upper)) { + edge = list_entry(node->upper.next, struct btrfs_backref_edge, + list[LOWER]); + upper = edge->node[UPPER]; + list_del(&edge->list[LOWER]); + list_del(&edge->list[UPPER]); + btrfs_backref_free_edge(cache, edge); + + if (RB_EMPTY_NODE(&upper->rb_node)) { + BUG_ON(!list_empty(&node->upper)); + btrfs_backref_drop_node(cache, node); + node = upper; + node->lowest = 1; + continue; + } + /* + * Add the node to leaf node list if no other child block + * cached. + */ + if (list_empty(&upper->lower)) { + list_add_tail(&upper->lower, &cache->leaves); + upper->lowest = 1; + } + } + + btrfs_backref_drop_node(cache, node); +} + +/* + * Release all nodes/edges from current cache + */ +void btrfs_backref_release_cache(struct btrfs_backref_cache *cache) +{ + struct btrfs_backref_node *node; + int i; + + while (!list_empty(&cache->detached)) { + node = list_entry(cache->detached.next, + struct btrfs_backref_node, list); + btrfs_backref_cleanup_node(cache, node); + } + + while (!list_empty(&cache->leaves)) { + node = list_entry(cache->leaves.next, + struct btrfs_backref_node, lower); + btrfs_backref_cleanup_node(cache, node); + } + + cache->last_trans = 0; + + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + ASSERT(list_empty(&cache->pending[i])); + ASSERT(list_empty(&cache->pending_edge)); + ASSERT(list_empty(&cache->useless_node)); + ASSERT(list_empty(&cache->changed)); + ASSERT(list_empty(&cache->detached)); + ASSERT(RB_EMPTY_ROOT(&cache->rb_root)); + ASSERT(!cache->nr_nodes); + ASSERT(!cache->nr_edges); +} + +/* + * Handle direct tree backref + * + * Direct tree backref means, the backref item shows its parent bytenr + * directly. This is for SHARED_BLOCK_REF backref (keyed or inlined). + * + * @ref_key: The converted backref key. + * For keyed backref, it's the item key. + * For inlined backref, objectid is the bytenr, + * type is btrfs_inline_ref_type, offset is + * btrfs_inline_ref_offset. + */ +static int handle_direct_tree_backref(struct btrfs_backref_cache *cache, + struct btrfs_key *ref_key, + struct btrfs_backref_node *cur) +{ + struct btrfs_backref_edge *edge; + struct btrfs_backref_node *upper; + struct rb_node *rb_node; + + ASSERT(ref_key->type == BTRFS_SHARED_BLOCK_REF_KEY); + + /* Only reloc root uses backref pointing to itself */ + if (ref_key->objectid == ref_key->offset) { + struct btrfs_root *root; + + cur->is_reloc_root = 1; + /* Only reloc backref cache cares about a specific root */ + if (cache->is_reloc) { + root = find_reloc_root(cache->fs_info, cur->bytenr); + if (WARN_ON(!root)) + return -ENOENT; + cur->root = root; + } else { + /* + * For generic purpose backref cache, reloc root node + * is useless. + */ + list_add(&cur->list, &cache->useless_node); + } + return 0; + } + + edge = btrfs_backref_alloc_edge(cache); + if (!edge) + return -ENOMEM; + + rb_node = rb_simple_search(&cache->rb_root, ref_key->offset); + if (!rb_node) { + /* Parent node not yet cached */ + upper = btrfs_backref_alloc_node(cache, ref_key->offset, + cur->level + 1); + if (!upper) { + btrfs_backref_free_edge(cache, edge); + return -ENOMEM; + } + + /* + * Backrefs for the upper level block isn't cached, add the + * block to pending list + */ + list_add_tail(&edge->list[UPPER], &cache->pending_edge); + } else { + /* Parent node already cached */ + upper = rb_entry(rb_node, struct btrfs_backref_node, rb_node); + ASSERT(upper->checked); + INIT_LIST_HEAD(&edge->list[UPPER]); + } + btrfs_backref_link_edge(edge, cur, upper, LINK_LOWER); + return 0; +} + +/* + * Handle indirect tree backref + * + * Indirect tree backref means, we only know which tree the node belongs to. + * We still need to do a tree search to find out the parents. This is for + * TREE_BLOCK_REF backref (keyed or inlined). + * + * @ref_key: The same as @ref_key in handle_direct_tree_backref() + * @tree_key: The first key of this tree block. + * @path: A clean (released) path, to avoid allocating path everytime + * the function get called. + */ +static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache, + struct btrfs_path *path, + struct btrfs_key *ref_key, + struct btrfs_key *tree_key, + struct btrfs_backref_node *cur) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_backref_node *upper; + struct btrfs_backref_node *lower; + struct btrfs_backref_edge *edge; + struct extent_buffer *eb; + struct btrfs_root *root; + struct rb_node *rb_node; + int level; + bool need_check = true; + int ret; + + root = btrfs_get_fs_root(fs_info, ref_key->offset, false); + if (IS_ERR(root)) + return PTR_ERR(root); + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) + cur->cowonly = 1; + + if (btrfs_root_level(&root->root_item) == cur->level) { + /* Tree root */ + ASSERT(btrfs_root_bytenr(&root->root_item) == cur->bytenr); + /* + * For reloc backref cache, we may ignore reloc root. But for + * general purpose backref cache, we can't rely on + * btrfs_should_ignore_reloc_root() as it may conflict with + * current running relocation and lead to missing root. + * + * For general purpose backref cache, reloc root detection is + * completely relying on direct backref (key->offset is parent + * bytenr), thus only do such check for reloc cache. + */ + if (btrfs_should_ignore_reloc_root(root) && cache->is_reloc) { + btrfs_put_root(root); + list_add(&cur->list, &cache->useless_node); + } else { + cur->root = root; + } + return 0; + } + + level = cur->level + 1; + + /* Search the tree to find parent blocks referring to the block */ + path->search_commit_root = 1; + path->skip_locking = 1; + path->lowest_level = level; + ret = btrfs_search_slot(NULL, root, tree_key, path, 0, 0); + path->lowest_level = 0; + if (ret < 0) { + btrfs_put_root(root); + return ret; + } + if (ret > 0 && path->slots[level] > 0) + path->slots[level]--; + + eb = path->nodes[level]; + if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) { + btrfs_err(fs_info, +"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)", + cur->bytenr, level - 1, root->root_key.objectid, + tree_key->objectid, tree_key->type, tree_key->offset); + btrfs_put_root(root); + ret = -ENOENT; + goto out; + } + lower = cur; + + /* Add all nodes and edges in the path */ + for (; level < BTRFS_MAX_LEVEL; level++) { + if (!path->nodes[level]) { + ASSERT(btrfs_root_bytenr(&root->root_item) == + lower->bytenr); + /* Same as previous should_ignore_reloc_root() call */ + if (btrfs_should_ignore_reloc_root(root) && + cache->is_reloc) { + btrfs_put_root(root); + list_add(&lower->list, &cache->useless_node); + } else { + lower->root = root; + } + break; + } + + edge = btrfs_backref_alloc_edge(cache); + if (!edge) { + btrfs_put_root(root); + ret = -ENOMEM; + goto out; + } + + eb = path->nodes[level]; + rb_node = rb_simple_search(&cache->rb_root, eb->start); + if (!rb_node) { + upper = btrfs_backref_alloc_node(cache, eb->start, + lower->level + 1); + if (!upper) { + btrfs_put_root(root); + btrfs_backref_free_edge(cache, edge); + ret = -ENOMEM; + goto out; + } + upper->owner = btrfs_header_owner(eb); + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) + upper->cowonly = 1; + + /* + * If we know the block isn't shared we can avoid + * checking its backrefs. + */ + if (btrfs_block_can_be_shared(root, eb)) + upper->checked = 0; + else + upper->checked = 1; + + /* + * Add the block to pending list if we need to check its + * backrefs, we only do this once while walking up a + * tree as we will catch anything else later on. + */ + if (!upper->checked && need_check) { + need_check = false; + list_add_tail(&edge->list[UPPER], + &cache->pending_edge); + } else { + if (upper->checked) + need_check = true; + INIT_LIST_HEAD(&edge->list[UPPER]); + } + } else { + upper = rb_entry(rb_node, struct btrfs_backref_node, + rb_node); + ASSERT(upper->checked); + INIT_LIST_HEAD(&edge->list[UPPER]); + if (!upper->owner) + upper->owner = btrfs_header_owner(eb); + } + btrfs_backref_link_edge(edge, lower, upper, LINK_LOWER); + + if (rb_node) { + btrfs_put_root(root); + break; + } + lower = upper; + upper = NULL; + } +out: + btrfs_release_path(path); + return ret; +} + +/* + * Add backref node @cur into @cache. + * + * NOTE: Even if the function returned 0, @cur is not yet cached as its upper + * links aren't yet bi-directional. Needs to finish such links. + * Use btrfs_backref_finish_upper_links() to finish such linkage. + * + * @path: Released path for indirect tree backref lookup + * @iter: Released backref iter for extent tree search + * @node_key: The first key of the tree block + */ +int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache, + struct btrfs_path *path, + struct btrfs_backref_iter *iter, + struct btrfs_key *node_key, + struct btrfs_backref_node *cur) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_backref_edge *edge; + struct btrfs_backref_node *exist; + int ret; + + ret = btrfs_backref_iter_start(iter, cur->bytenr); + if (ret < 0) + return ret; + /* + * We skip the first btrfs_tree_block_info, as we don't use the key + * stored in it, but fetch it from the tree block + */ + if (btrfs_backref_has_tree_block_info(iter)) { + ret = btrfs_backref_iter_next(iter); + if (ret < 0) + goto out; + /* No extra backref? This means the tree block is corrupted */ + if (ret > 0) { + ret = -EUCLEAN; + goto out; + } + } + WARN_ON(cur->checked); + if (!list_empty(&cur->upper)) { + /* + * The backref was added previously when processing backref of + * type BTRFS_TREE_BLOCK_REF_KEY + */ + ASSERT(list_is_singular(&cur->upper)); + edge = list_entry(cur->upper.next, struct btrfs_backref_edge, + list[LOWER]); + ASSERT(list_empty(&edge->list[UPPER])); + exist = edge->node[UPPER]; + /* + * Add the upper level block to pending list if we need check + * its backrefs + */ + if (!exist->checked) + list_add_tail(&edge->list[UPPER], &cache->pending_edge); + } else { + exist = NULL; + } + + for (; ret == 0; ret = btrfs_backref_iter_next(iter)) { + struct extent_buffer *eb; + struct btrfs_key key; + int type; + + cond_resched(); + eb = btrfs_backref_get_eb(iter); + + key.objectid = iter->bytenr; + if (btrfs_backref_iter_is_inline_ref(iter)) { + struct btrfs_extent_inline_ref *iref; + + /* Update key for inline backref */ + iref = (struct btrfs_extent_inline_ref *) + ((unsigned long)iter->cur_ptr); + type = btrfs_get_extent_inline_ref_type(eb, iref, + BTRFS_REF_TYPE_BLOCK); + if (type == BTRFS_REF_TYPE_INVALID) { + ret = -EUCLEAN; + goto out; + } + key.type = type; + key.offset = btrfs_extent_inline_ref_offset(eb, iref); + } else { + key.type = iter->cur_key.type; + key.offset = iter->cur_key.offset; + } + + /* + * Parent node found and matches current inline ref, no need to + * rebuild this node for this inline ref + */ + if (exist && + ((key.type == BTRFS_TREE_BLOCK_REF_KEY && + exist->owner == key.offset) || + (key.type == BTRFS_SHARED_BLOCK_REF_KEY && + exist->bytenr == key.offset))) { + exist = NULL; + continue; + } + + /* SHARED_BLOCK_REF means key.offset is the parent bytenr */ + if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { + ret = handle_direct_tree_backref(cache, &key, cur); + if (ret < 0) + goto out; + continue; + } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { + ret = -EINVAL; + btrfs_print_v0_err(fs_info); + btrfs_handle_fs_error(fs_info, ret, NULL); + goto out; + } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { + continue; + } + + /* + * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset + * means the root objectid. We need to search the tree to get + * its parent bytenr. + */ + ret = handle_indirect_tree_backref(cache, path, &key, node_key, + cur); + if (ret < 0) + goto out; + } + ret = 0; + cur->checked = 1; + WARN_ON(exist); +out: + btrfs_backref_iter_release(iter); + return ret; +} + +/* + * Finish the upwards linkage created by btrfs_backref_add_tree_node() + */ +int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *start) +{ + struct list_head *useless_node = &cache->useless_node; + struct btrfs_backref_edge *edge; + struct rb_node *rb_node; + LIST_HEAD(pending_edge); + + ASSERT(start->checked); + + /* Insert this node to cache if it's not COW-only */ + if (!start->cowonly) { + rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, + &start->rb_node); + if (rb_node) + btrfs_backref_panic(cache->fs_info, start->bytenr, + -EEXIST); + list_add_tail(&start->lower, &cache->leaves); + } + + /* + * Use breadth first search to iterate all related edges. + * + * The starting points are all the edges of this node + */ + list_for_each_entry(edge, &start->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &pending_edge); + + while (!list_empty(&pending_edge)) { + struct btrfs_backref_node *upper; + struct btrfs_backref_node *lower; + struct rb_node *rb_node; + + edge = list_first_entry(&pending_edge, + struct btrfs_backref_edge, list[UPPER]); + list_del_init(&edge->list[UPPER]); + upper = edge->node[UPPER]; + lower = edge->node[LOWER]; + + /* Parent is detached, no need to keep any edges */ + if (upper->detached) { + list_del(&edge->list[LOWER]); + btrfs_backref_free_edge(cache, edge); + + /* Lower node is orphan, queue for cleanup */ + if (list_empty(&lower->upper)) + list_add(&lower->list, useless_node); + continue; + } + + /* + * All new nodes added in current build_backref_tree() haven't + * been linked to the cache rb tree. + * So if we have upper->rb_node populated, this means a cache + * hit. We only need to link the edge, as @upper and all its + * parents have already been linked. + */ + if (!RB_EMPTY_NODE(&upper->rb_node)) { + if (upper->lowest) { + list_del_init(&upper->lower); + upper->lowest = 0; + } + + list_add_tail(&edge->list[UPPER], &upper->lower); + continue; + } + + /* Sanity check, we shouldn't have any unchecked nodes */ + if (!upper->checked) { + ASSERT(0); + return -EUCLEAN; + } + + /* Sanity check, COW-only node has non-COW-only parent */ + if (start->cowonly != upper->cowonly) { + ASSERT(0); + return -EUCLEAN; + } + + /* Only cache non-COW-only (subvolume trees) tree blocks */ + if (!upper->cowonly) { + rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr, + &upper->rb_node); + if (rb_node) { + btrfs_backref_panic(cache->fs_info, + upper->bytenr, -EEXIST); + return -EUCLEAN; + } + } + + list_add_tail(&edge->list[UPPER], &upper->lower); + + /* + * Also queue all the parent edges of this uncached node + * to finish the upper linkage + */ + list_for_each_entry(edge, &upper->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &pending_edge); + } + return 0; +} + +void btrfs_backref_error_cleanup(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *node) +{ + struct btrfs_backref_node *lower; + struct btrfs_backref_node *upper; + struct btrfs_backref_edge *edge; + + while (!list_empty(&cache->useless_node)) { + lower = list_first_entry(&cache->useless_node, + struct btrfs_backref_node, list); + list_del_init(&lower->list); + } + while (!list_empty(&cache->pending_edge)) { + edge = list_first_entry(&cache->pending_edge, + struct btrfs_backref_edge, list[UPPER]); + list_del(&edge->list[UPPER]); + list_del(&edge->list[LOWER]); + lower = edge->node[LOWER]; + upper = edge->node[UPPER]; + btrfs_backref_free_edge(cache, edge); + + /* + * Lower is no longer linked to any upper backref nodes and + * isn't in the cache, we can free it ourselves. + */ + if (list_empty(&lower->upper) && + RB_EMPTY_NODE(&lower->rb_node)) + list_add(&lower->list, &cache->useless_node); + + if (!RB_EMPTY_NODE(&upper->rb_node)) + continue; + + /* Add this guy's upper edges to the list to process */ + list_for_each_entry(edge, &upper->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], + &cache->pending_edge); + if (list_empty(&upper->upper)) + list_add(&upper->list, &cache->useless_node); + } + + while (!list_empty(&cache->useless_node)) { + lower = list_first_entry(&cache->useless_node, + struct btrfs_backref_node, list); + list_del_init(&lower->list); + if (lower == node) + node = NULL; + btrfs_backref_free_node(cache, lower); + } + + btrfs_backref_cleanup_node(cache, node); + ASSERT(list_empty(&cache->useless_node) && + list_empty(&cache->pending_edge)); +} diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 723d6da99114..ff705cc564a9 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -8,6 +8,7 @@ #include <linux/btrfs.h> #include "ulist.h" +#include "disk-io.h" #include "extent_io.h" struct inode_fs_paths { @@ -78,4 +79,300 @@ struct prelim_ref { u64 wanted_disk_byte; }; +/* + * Iterate backrefs of one extent. + * + * Now it only supports iteration of tree block in commit root. + */ +struct btrfs_backref_iter { + u64 bytenr; + struct btrfs_path *path; + struct btrfs_fs_info *fs_info; + struct btrfs_key cur_key; + u32 item_ptr; + u32 cur_ptr; + u32 end_ptr; +}; + +struct btrfs_backref_iter *btrfs_backref_iter_alloc( + struct btrfs_fs_info *fs_info, gfp_t gfp_flag); + +static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter) +{ + if (!iter) + return; + btrfs_free_path(iter->path); + kfree(iter); +} + +static inline struct extent_buffer *btrfs_backref_get_eb( + struct btrfs_backref_iter *iter) +{ + if (!iter) + return NULL; + return iter->path->nodes[0]; +} + +/* + * For metadata with EXTENT_ITEM key (non-skinny) case, the first inline data + * is btrfs_tree_block_info, without a btrfs_extent_inline_ref header. + * + * This helper determines if that's the case. + */ +static inline bool btrfs_backref_has_tree_block_info( + struct btrfs_backref_iter *iter) +{ + if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY && + iter->cur_ptr - iter->item_ptr == sizeof(struct btrfs_extent_item)) + return true; + return false; +} + +int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr); + +int btrfs_backref_iter_next(struct btrfs_backref_iter *iter); + +static inline bool btrfs_backref_iter_is_inline_ref( + struct btrfs_backref_iter *iter) +{ + if (iter->cur_key.type == BTRFS_EXTENT_ITEM_KEY || + iter->cur_key.type == BTRFS_METADATA_ITEM_KEY) + return true; + return false; +} + +static inline void btrfs_backref_iter_release(struct btrfs_backref_iter *iter) +{ + iter->bytenr = 0; + iter->item_ptr = 0; + iter->cur_ptr = 0; + iter->end_ptr = 0; + btrfs_release_path(iter->path); + memset(&iter->cur_key, 0, sizeof(iter->cur_key)); +} + +/* + * Backref cache related structures + * + * The whole objective of backref_cache is to build a bi-directional map + * of tree blocks (represented by backref_node) and all their parents. + */ + +/* + * Represent a tree block in the backref cache + */ +struct btrfs_backref_node { + struct { + struct rb_node rb_node; + u64 bytenr; + }; /* Use rb_simple_node for search/insert */ + + u64 new_bytenr; + /* Objectid of tree block owner, can be not uptodate */ + u64 owner; + /* Link to pending, changed or detached list */ + struct list_head list; + + /* List of upper level edges, which link this node to its parents */ + struct list_head upper; + /* List of lower level edges, which link this node to its children */ + struct list_head lower; + + /* NULL if this node is not tree root */ + struct btrfs_root *root; + /* Extent buffer got by COWing the block */ + struct extent_buffer *eb; + /* Level of the tree block */ + unsigned int level:8; + /* Is the block in a non-shareable tree */ + unsigned int cowonly:1; + /* 1 if no child node is in the cache */ + unsigned int lowest:1; + /* Is the extent buffer locked */ + unsigned int locked:1; + /* Has the block been processed */ + unsigned int processed:1; + /* Have backrefs of this block been checked */ + unsigned int checked:1; + /* + * 1 if corresponding block has been COWed but some upper level block + * pointers may not point to the new location + */ + unsigned int pending:1; + /* 1 if the backref node isn't connected to any other backref node */ + unsigned int detached:1; + + /* + * For generic purpose backref cache, where we only care if it's a reloc + * root, doesn't care the source subvolid. + */ + unsigned int is_reloc_root:1; +}; + +#define LOWER 0 +#define UPPER 1 + +/* + * Represent an edge connecting upper and lower backref nodes. + */ +struct btrfs_backref_edge { + /* + * list[LOWER] is linked to btrfs_backref_node::upper of lower level + * node, and list[UPPER] is linked to btrfs_backref_node::lower of + * upper level node. + * + * Also, build_backref_tree() uses list[UPPER] for pending edges, before + * linking list[UPPER] to its upper level nodes. + */ + struct list_head list[2]; + + /* Two related nodes */ + struct btrfs_backref_node *node[2]; +}; + +struct btrfs_backref_cache { + /* Red black tree of all backref nodes in the cache */ + struct rb_root rb_root; + /* For passing backref nodes to btrfs_reloc_cow_block */ + struct btrfs_backref_node *path[BTRFS_MAX_LEVEL]; + /* + * List of blocks that have been COWed but some block pointers in upper + * level blocks may not reflect the new location + */ + struct list_head pending[BTRFS_MAX_LEVEL]; + /* List of backref nodes with no child node */ + struct list_head leaves; + /* List of blocks that have been COWed in current transaction */ + struct list_head changed; + /* List of detached backref node. */ + struct list_head detached; + + u64 last_trans; + + int nr_nodes; + int nr_edges; + + /* List of unchecked backref edges during backref cache build */ + struct list_head pending_edge; + + /* List of useless backref nodes during backref cache build */ + struct list_head useless_node; + + struct btrfs_fs_info *fs_info; + + /* + * Whether this cache is for relocation + * + * Reloction backref cache require more info for reloc root compared + * to generic backref cache. + */ + unsigned int is_reloc; +}; + +void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info, + struct btrfs_backref_cache *cache, int is_reloc); +struct btrfs_backref_node *btrfs_backref_alloc_node( + struct btrfs_backref_cache *cache, u64 bytenr, int level); +struct btrfs_backref_edge *btrfs_backref_alloc_edge( + struct btrfs_backref_cache *cache); + +#define LINK_LOWER (1 << 0) +#define LINK_UPPER (1 << 1) +static inline void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, + struct btrfs_backref_node *lower, + struct btrfs_backref_node *upper, + int link_which) +{ + ASSERT(upper && lower && upper->level == lower->level + 1); + edge->node[LOWER] = lower; + edge->node[UPPER] = upper; + if (link_which & LINK_LOWER) + list_add_tail(&edge->list[LOWER], &lower->upper); + if (link_which & LINK_UPPER) + list_add_tail(&edge->list[UPPER], &upper->lower); +} + +static inline void btrfs_backref_free_node(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *node) +{ + if (node) { + cache->nr_nodes--; + btrfs_put_root(node->root); + kfree(node); + } +} + +static inline void btrfs_backref_free_edge(struct btrfs_backref_cache *cache, + struct btrfs_backref_edge *edge) +{ + if (edge) { + cache->nr_edges--; + kfree(edge); + } +} + +static inline void btrfs_backref_unlock_node_buffer( + struct btrfs_backref_node *node) +{ + if (node->locked) { + btrfs_tree_unlock(node->eb); + node->locked = 0; + } +} + +static inline void btrfs_backref_drop_node_buffer( + struct btrfs_backref_node *node) +{ + if (node->eb) { + btrfs_backref_unlock_node_buffer(node); + free_extent_buffer(node->eb); + node->eb = NULL; + } +} + +/* + * Drop the backref node from cache without cleaning up its children + * edges. + * + * This can only be called on node without parent edges. + * The children edges are still kept as is. + */ +static inline void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, + struct btrfs_backref_node *node) +{ + BUG_ON(!list_empty(&node->upper)); + + btrfs_backref_drop_node_buffer(node); + list_del(&node->list); + list_del(&node->lower); + if (!RB_EMPTY_NODE(&node->rb_node)) + rb_erase(&node->rb_node, &tree->rb_root); + btrfs_backref_free_node(tree, node); +} + +void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *node); + +void btrfs_backref_release_cache(struct btrfs_backref_cache *cache); + +static inline void btrfs_backref_panic(struct btrfs_fs_info *fs_info, + u64 bytenr, int errno) +{ + btrfs_panic(fs_info, errno, + "Inconsistency in backref cache found at offset %llu", + bytenr); +} + +int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache, + struct btrfs_path *path, + struct btrfs_backref_iter *iter, + struct btrfs_key *node_key, + struct btrfs_backref_node *cur); + +int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *start); + +void btrfs_backref_error_cleanup(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *node); + #endif diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 696f47103cfc..176e8a292fd1 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -7,7 +7,6 @@ #include "disk-io.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "disk-io.h" #include "volumes.h" #include "transaction.h" #include "ref-verify.h" @@ -161,6 +160,8 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, struct rb_node *parent = NULL; struct btrfs_block_group *cache; + ASSERT(block_group->length != 0); + spin_lock(&info->block_group_cache_lock); p = &info->block_group_cache_tree.rb_node; @@ -863,11 +864,34 @@ static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags) } } +static int remove_block_group_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root; + struct btrfs_key key; + int ret; + + root = fs_info->extent_root; + key.objectid = block_group->start; + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + key.offset = block_group->length; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; + if (ret < 0) + return ret; + + ret = btrfs_del_item(trans, root, path); + return ret; +} + int btrfs_remove_block_group(struct btrfs_trans_handle *trans, u64 group_start, struct extent_map *em) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *root = fs_info->extent_root; struct btrfs_path *path; struct btrfs_block_group *block_group; struct btrfs_free_cluster *cluster; @@ -1065,26 +1089,25 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&block_group->space_info->lock); - key.objectid = block_group->start; - key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - key.offset = block_group->length; - mutex_lock(&fs_info->chunk_mutex); spin_lock(&block_group->lock); block_group->removed = 1; /* - * At this point trimming can't start on this block group, because we - * removed the block group from the tree fs_info->block_group_cache_tree - * so no one can't find it anymore and even if someone already got this - * block group before we removed it from the rbtree, they have already - * incremented block_group->trimming - if they didn't, they won't find - * any free space entries because we already removed them all when we - * called btrfs_remove_free_space_cache(). + * At this point trimming or scrub can't start on this block group, + * because we removed the block group from the rbtree + * fs_info->block_group_cache_tree so no one can't find it anymore and + * even if someone already got this block group before we removed it + * from the rbtree, they have already incremented block_group->frozen - + * if they didn't, for the trimming case they won't find any free space + * entries because we already removed them all when we called + * btrfs_remove_free_space_cache(). * * And we must not remove the extent map from the fs_info->mapping_tree * to prevent the same logical address range and physical device space - * ranges from being reused for a new block group. This is because our - * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is + * ranges from being reused for a new block group. This is needed to + * avoid races with trimming and scrub. + * + * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is * completely transactionless, so while it is trimming a range the * currently running transaction might finish and a new one start, * allowing for new block groups to be created that can reuse the same @@ -1095,7 +1118,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * in place until the extents have been discarded completely when * the transaction commit has completed. */ - remove_em = (atomic_read(&block_group->trimming) == 0); + remove_em = (atomic_read(&block_group->frozen) == 0); spin_unlock(&block_group->lock); mutex_unlock(&fs_info->chunk_mutex); @@ -1107,16 +1130,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, /* Once for the block groups rbtree */ btrfs_put_block_group(block_group); - ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret > 0) - ret = -EIO; + ret = remove_block_group_item(trans, path, block_group); if (ret < 0) goto out; - ret = btrfs_del_item(trans, root, path); - if (ret) - goto out; - if (remove_em) { struct extent_map_tree *em_tree; @@ -1175,7 +1192,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( free_extent_map(em); return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root, - num_items, 1); + num_items); } /* @@ -1284,25 +1301,17 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans, ret = clear_extent_bits(&prev_trans->pinned_extents, start, end, EXTENT_DIRTY); if (ret) - goto err; + goto out; } ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end, EXTENT_DIRTY); - if (ret) - goto err; +out: mutex_unlock(&fs_info->unused_bg_unpin_mutex); if (prev_trans) btrfs_put_transaction(prev_trans); - return true; - -err: - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - if (prev_trans) - btrfs_put_transaction(prev_trans); - btrfs_dec_block_group_ro(bg); - return false; + return ret == 0; } /* @@ -1400,8 +1409,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * We could have pending pinned extents for this block group, * just delete them, we don't care about them anymore. */ - if (!clean_pinned_extents(trans, block_group)) + if (!clean_pinned_extents(trans, block_group)) { + btrfs_dec_block_group_ro(block_group); goto end_trans; + } /* * At this point, the block_group is read only and should fail @@ -1450,7 +1461,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) /* Implicit trim during transaction commit. */ if (trimming) - btrfs_get_block_group_trimming(block_group); + btrfs_freeze_block_group(block_group); /* * Btrfs_remove_chunk will abort the transaction if things go @@ -1460,7 +1471,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) if (ret) { if (trimming) - btrfs_put_block_group_trimming(block_group); + btrfs_unfreeze_block_group(block_group); goto end_trans; } @@ -1774,7 +1785,7 @@ static void link_block_group(struct btrfs_block_group *cache) } static struct btrfs_block_group *btrfs_create_block_group_cache( - struct btrfs_fs_info *fs_info, u64 start, u64 size) + struct btrfs_fs_info *fs_info, u64 start) { struct btrfs_block_group *cache; @@ -1790,7 +1801,6 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( } cache->start = start; - cache->length = size; cache->fs_info = fs_info; cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); @@ -1809,7 +1819,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( INIT_LIST_HEAD(&cache->dirty_list); INIT_LIST_HEAD(&cache->io_list); btrfs_init_free_space_ctl(cache); - atomic_set(&cache->trimming, 0); + atomic_set(&cache->frozen, 0); mutex_init(&cache->free_space_lock); btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); @@ -1870,25 +1880,44 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) return ret; } +static int read_block_group_item(struct btrfs_block_group *cache, + struct btrfs_path *path, + const struct btrfs_key *key) +{ + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_block_group_item bgi; + int slot = path->slots[0]; + + cache->length = key->offset; + + read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), + sizeof(bgi)); + cache->used = btrfs_stack_block_group_used(&bgi); + cache->flags = btrfs_stack_block_group_flags(&bgi); + + return 0; +} + static int read_one_block_group(struct btrfs_fs_info *info, struct btrfs_path *path, const struct btrfs_key *key, int need_clear) { - struct extent_buffer *leaf = path->nodes[0]; struct btrfs_block_group *cache; struct btrfs_space_info *space_info; - struct btrfs_block_group_item bgi; const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS); - int slot = path->slots[0]; int ret; ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY); - cache = btrfs_create_block_group_cache(info, key->objectid, key->offset); + cache = btrfs_create_block_group_cache(info, key->objectid); if (!cache) return -ENOMEM; + ret = read_block_group_item(cache, path, key); + if (ret < 0) + goto error; + if (need_clear) { /* * When we mount with old space cache, we need to @@ -1903,10 +1932,6 @@ static int read_one_block_group(struct btrfs_fs_info *info, if (btrfs_test_opt(info, SPACE_CACHE)) cache->disk_cache_state = BTRFS_DC_CLEAR; } - read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), - sizeof(bgi)); - cache->used = btrfs_stack_block_group_used(&bgi); - cache->flags = btrfs_stack_block_group_flags(&bgi); if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) && (cache->flags & BTRFS_BLOCK_GROUP_DATA))) { btrfs_err(info, @@ -1934,15 +1959,15 @@ static int read_one_block_group(struct btrfs_fs_info *info, * are empty, and we can just add all the space in and be done with it. * This saves us _a_lot_ of time, particularly in the full case. */ - if (key->offset == cache->used) { + if (cache->length == cache->used) { cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; btrfs_free_excluded_extents(cache); } else if (cache->used == 0) { cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; - add_new_free_space(cache, key->objectid, - key->objectid + key->offset); + add_new_free_space(cache, cache->start, + cache->start + cache->length); btrfs_free_excluded_extents(cache); } @@ -1952,7 +1977,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, goto error; } trace_btrfs_add_block_group(info, cache, 0); - btrfs_update_space_info(info, cache->flags, key->offset, + btrfs_update_space_info(info, cache->flags, cache->length, cache->used, cache->bytes_super, &space_info); cache->space_info = space_info; @@ -1991,7 +2016,6 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = READA_FORWARD; cache_gen = btrfs_super_cache_generation(info->super_copy); if (btrfs_test_opt(info, SPACE_CACHE) && @@ -2046,13 +2070,32 @@ error: return ret; } +static int insert_block_group_item(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_group_item bgi; + struct btrfs_root *root; + struct btrfs_key key; + + spin_lock(&block_group->lock); + btrfs_set_stack_block_group_used(&bgi, block_group->used); + btrfs_set_stack_block_group_chunk_objectid(&bgi, + BTRFS_FIRST_CHUNK_TREE_OBJECTID); + btrfs_set_stack_block_group_flags(&bgi, block_group->flags); + key.objectid = block_group->start; + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + key.offset = block_group->length; + spin_unlock(&block_group->lock); + + root = fs_info->extent_root; + return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); +} + void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *block_group; - struct btrfs_root *extent_root = fs_info->extent_root; - struct btrfs_block_group_item item; - struct btrfs_key key; int ret = 0; if (!trans->can_flush_pending_bgs) @@ -2065,21 +2108,11 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) if (ret) goto next; - spin_lock(&block_group->lock); - btrfs_set_stack_block_group_used(&item, block_group->used); - btrfs_set_stack_block_group_chunk_objectid(&item, - BTRFS_FIRST_CHUNK_TREE_OBJECTID); - btrfs_set_stack_block_group_flags(&item, block_group->flags); - key.objectid = block_group->start; - key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - key.offset = block_group->length; - spin_unlock(&block_group->lock); - - ret = btrfs_insert_item(trans, extent_root, &key, &item, - sizeof(item)); + ret = insert_block_group_item(trans, block_group); if (ret) btrfs_abort_transaction(trans, ret); - ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset); + ret = btrfs_finish_chunk_alloc(trans, block_group->start, + block_group->length); if (ret) btrfs_abort_transaction(trans, ret); add_block_group_free_space(trans, block_group); @@ -2100,10 +2133,11 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, btrfs_set_log_full_commit(trans); - cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size); + cache = btrfs_create_block_group_cache(fs_info, chunk_offset); if (!cache) return -ENOMEM; + cache->length = size; cache->used = bytes_used; cache->flags = type; cache->last_byte_to_unpin = (u64)-1; @@ -2314,13 +2348,13 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) spin_unlock(&sinfo->lock); } -static int write_one_cache_group(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_block_group *cache) +static int update_block_group_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = trans->fs_info; int ret; - struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_root *root = fs_info->extent_root; unsigned long bi; struct extent_buffer *leaf; struct btrfs_block_group_item bgi; @@ -2330,7 +2364,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; key.offset = cache->length; - ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret) { if (ret > 0) ret = -ENOENT; @@ -2642,7 +2676,7 @@ again: } } if (!ret) { - ret = write_one_cache_group(trans, path, cache); + ret = update_block_group_item(trans, path, cache); /* * Our block group might still be attached to the list * of new block groups in the transaction handle of some @@ -2791,7 +2825,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) } } if (!ret) { - ret = write_one_cache_group(trans, path, cache); + ret = update_block_group_item(trans, path, cache); /* * One of the free space endio workers might have * created a new block group while updating a free space @@ -2808,7 +2842,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) if (ret == -ENOENT) { wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); - ret = write_one_cache_group(trans, path, cache); + ret = update_block_group_item(trans, path, cache); } if (ret) btrfs_abort_transaction(trans, ret); @@ -3384,3 +3418,44 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) } return 0; } + +void btrfs_freeze_block_group(struct btrfs_block_group *cache) +{ + atomic_inc(&cache->frozen); +} + +void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct extent_map_tree *em_tree; + struct extent_map *em; + bool cleanup; + + spin_lock(&block_group->lock); + cleanup = (atomic_dec_and_test(&block_group->frozen) && + block_group->removed); + spin_unlock(&block_group->lock); + + if (cleanup) { + mutex_lock(&fs_info->chunk_mutex); + em_tree = &fs_info->mapping_tree; + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, block_group->start, + 1); + BUG_ON(!em); /* logic error, can't happen */ + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + mutex_unlock(&fs_info->chunk_mutex); + + /* once for us and once for the tree */ + free_extent_map(em); + free_extent_map(em); + + /* + * We may have left one free space entry and other possible + * tasks trimming this block group have left 1 entry each one. + * Free them if any. + */ + __btrfs_remove_free_space_cache(block_group->free_space_ctl); + } +} diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 107bb557ca8d..b6ee70a039c7 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -129,8 +129,17 @@ struct btrfs_block_group { /* For read-only block groups */ struct list_head ro_list; + /* + * When non-zero it means the block group's logical address and its + * device extents can not be reused for future block group allocations + * until the counter goes down to 0. This is to prevent them from being + * reused while some task is still using the block group after it was + * deleted - we want to make sure they can only be reused for new block + * groups after that task is done with the deleted block group. + */ + atomic_t frozen; + /* For discard operations */ - atomic_t trimming; struct list_head discard_list; int discard_index; u64 discard_eligible_time; @@ -283,6 +292,9 @@ static inline int btrfs_block_group_done(struct btrfs_block_group *cache) cache->cached == BTRFS_CACHE_ERROR; } +void btrfs_freeze_block_group(struct btrfs_block_group *cache); +void btrfs_unfreeze_block_group(struct btrfs_block_group *cache); + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 **logical, int *naddrs, int *stripe_len); diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 27efec8f7c5b..7e1549a84fcc 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -5,6 +5,7 @@ #include "block-rsv.h" #include "space-info.h" #include "transaction.h" +#include "block-group.h" /* * HOW DO BLOCK RESERVES WORK @@ -405,6 +406,8 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) else block_rsv->full = 0; + if (block_rsv->size >= sinfo->total_bytes) + sinfo->force_alloc = CHUNK_ALLOC_FORCE; spin_unlock(&block_rsv->lock); spin_unlock(&sinfo->lock); } @@ -455,7 +458,7 @@ static struct btrfs_block_rsv *get_block_rsv( struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *block_rsv = NULL; - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || (root == fs_info->csum_root && trans->adding_csums) || (root == fs_info->uuid_root)) block_rsv = trans->block_rsv; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 27a1fefce508..aeff56a0e105 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -7,6 +7,7 @@ #define BTRFS_INODE_H #include <linux/hash.h> +#include <linux/refcount.h> #include "extent_map.h" #include "extent_io.h" #include "ordered-data.h" @@ -27,7 +28,6 @@ enum { BTRFS_INODE_NEEDS_FULL_SYNC, BTRFS_INODE_COPY_EVERYTHING, BTRFS_INODE_IN_DELALLOC_LIST, - BTRFS_INODE_READDIO_NEED_LOCK, BTRFS_INODE_HAS_PROPS, BTRFS_INODE_SNAPSHOT_FLUSH, }; @@ -293,53 +293,25 @@ static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) return ret; } -#define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1 - struct btrfs_dio_private { struct inode *inode; - unsigned long flags; u64 logical_offset; u64 disk_bytenr; u64 bytes; - void *private; - - /* number of bios pending for this dio */ - atomic_t pending_bios; - /* IO errors */ - int errors; - - /* orig_bio is our btrfs_io_bio */ - struct bio *orig_bio; + /* + * References to this structure. There is one reference per in-flight + * bio plus one while we're still setting up. + */ + refcount_t refs; /* dio_bio came from fs/direct-io.c */ struct bio *dio_bio; - /* - * The original bio may be split to several sub-bios, this is - * done during endio of sub-bios - */ - blk_status_t (*subio_endio)(struct inode *, struct btrfs_io_bio *, - blk_status_t); + /* Array of checksums */ + u8 csums[]; }; -/* - * Disable DIO read nolock optimization, so new dio readers will be forced - * to grab i_mutex. It is used to avoid the endless truncate due to - * nonlocked dio read. - */ -static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode) -{ - set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); - smp_mb(); -} - -static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode) -{ - smp_mb__before_atomic(); - clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); -} - /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 9ab610cc9114..c6e648603f85 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -194,11 +194,9 @@ static int check_compressed_csum(struct btrfs_inode *inode, for (i = 0; i < cb->nr_pages; i++) { page = cb->compressed_pages[i]; - crypto_shash_init(shash); kaddr = kmap_atomic(page); - crypto_shash_update(shash, kaddr, PAGE_SIZE); + crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum); kunmap_atomic(kaddr); - crypto_shash_final(shash, (u8 *)&csum); if (memcmp(&csum, cb_sum, csum_size)) { btrfs_print_data_csum_error(inode, disk_start, @@ -1142,6 +1140,22 @@ static void put_workspace(int type, struct list_head *ws) } /* + * Adjust @level according to the limits of the compression algorithm or + * fallback to default + */ +static unsigned int btrfs_compress_set_level(int type, unsigned level) +{ + const struct btrfs_compress_op *ops = btrfs_compress_op[type]; + + if (level == 0) + level = ops->default_level; + else + level = min(level, ops->max_level); + + return level; +} + +/* * Given an address space and start and length, compress the bytes into @pages * that are allocated on demand. * @@ -1748,19 +1762,3 @@ unsigned int btrfs_compress_str2level(unsigned int type, const char *str) return level; } - -/* - * Adjust @level according to the limits of the compression algorithm or - * fallback to default - */ -unsigned int btrfs_compress_set_level(int type, unsigned level) -{ - const struct btrfs_compress_op *ops = btrfs_compress_op[type]; - - if (level == 0) - level = ops->default_level; - else - level = min(level, ops->max_level); - - return level; -} diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index d253f7aa8ed5..284a3ad31350 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -140,8 +140,6 @@ extern const struct btrfs_compress_op btrfs_zstd_compress; const char* btrfs_compress_type2str(enum btrfs_compression_type type); bool btrfs_compress_is_valid_type(const char *str, size_t len); -unsigned int btrfs_compress_set_level(int type, unsigned level); - int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index bfedbbe2311f..3a7648bff42c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -144,9 +144,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) return eb; } -/* cowonly root (everything not a reference counted cow subvolume), just get - * put onto a simple dirty list. transaction.c walks this to make sure they - * get properly updated on disk. +/* + * Cowonly root (not-shareable trees, everything not subvolume or reloc roots), + * just get put onto a simple dirty list. Transaction walks this list to make + * sure they get properly updated on disk. */ static void add_root_to_dirty_list(struct btrfs_root *root) { @@ -185,9 +186,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, int level; struct btrfs_disk_key disk_key; - WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && trans->transid != fs_info->running_transaction->transid); - WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && trans->transid != root->last_trans); level = btrfs_header_level(buf); @@ -826,12 +827,11 @@ int btrfs_block_can_be_shared(struct btrfs_root *root, struct extent_buffer *buf) { /* - * Tree blocks not in reference counted trees and tree roots - * are never shared. If a block was allocated after the last - * snapshot and the block was not allocated by tree relocation, - * we know the block is not shared. + * Tree blocks not in shareable trees and tree roots are never shared. + * If a block was allocated after the last snapshot and the block was + * not allocated by tree relocation, we know the block is not shared. */ - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && buf != root->node && buf != root->commit_root && (btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item) || @@ -1024,9 +1024,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_assert_tree_locked(buf); - WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && trans->transid != fs_info->running_transaction->transid); - WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && trans->transid != root->last_trans); level = btrfs_header_level(buf); @@ -1065,7 +1065,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return ret; } - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { + if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1668,15 +1668,8 @@ static noinline int generic_bin_search(struct extent_buffer *eb, { int low = 0; int high = max; - int mid; int ret; - struct btrfs_disk_key *tmp = NULL; - struct btrfs_disk_key unaligned; - unsigned long offset; - char *kaddr = NULL; - unsigned long map_start = 0; - unsigned long map_len = 0; - int err; + const int key_size = sizeof(struct btrfs_disk_key); if (low > high) { btrfs_err(eb->fs_info, @@ -1687,32 +1680,26 @@ static noinline int generic_bin_search(struct extent_buffer *eb, } while (low < high) { + unsigned long oip; + unsigned long offset; + struct btrfs_disk_key *tmp; + struct btrfs_disk_key unaligned; + int mid; + mid = (low + high) / 2; offset = p + mid * item_size; + oip = offset_in_page(offset); - if (!kaddr || offset < map_start || - (offset + sizeof(struct btrfs_disk_key)) > - map_start + map_len) { - - err = map_private_extent_buffer(eb, offset, - sizeof(struct btrfs_disk_key), - &kaddr, &map_start, &map_len); - - if (!err) { - tmp = (struct btrfs_disk_key *)(kaddr + offset - - map_start); - } else if (err == 1) { - read_extent_buffer(eb, &unaligned, - offset, sizeof(unaligned)); - tmp = &unaligned; - } else { - return err; - } + if (oip + key_size <= PAGE_SIZE) { + const unsigned long idx = offset >> PAGE_SHIFT; + char *kaddr = page_address(eb->pages[idx]); + tmp = (struct btrfs_disk_key *)(kaddr + oip); } else { - tmp = (struct btrfs_disk_key *)(kaddr + offset - - map_start); + read_extent_buffer(eb, &unaligned, offset, key_size); + tmp = &unaligned; } + ret = comp_keys(tmp, key); if (ret < 0) @@ -1733,9 +1720,9 @@ static noinline int generic_bin_search(struct extent_buffer *eb, * leaves vs nodes */ int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, - int level, int *slot) + int *slot) { - if (level == 0) + if (btrfs_header_level(eb) == 0) return generic_bin_search(eb, offsetof(struct btrfs_leaf, items), sizeof(struct btrfs_item), @@ -2348,16 +2335,15 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, struct btrfs_fs_info *fs_info = root->fs_info; u64 blocknr; u64 gen; - struct extent_buffer *b = *eb_ret; struct extent_buffer *tmp; struct btrfs_key first_key; int ret; int parent_level; - blocknr = btrfs_node_blockptr(b, slot); - gen = btrfs_node_ptr_generation(b, slot); - parent_level = btrfs_header_level(b); - btrfs_node_key_to_cpu(b, &first_key, slot); + blocknr = btrfs_node_blockptr(*eb_ret, slot); + gen = btrfs_node_ptr_generation(*eb_ret, slot); + parent_level = btrfs_header_level(*eb_ret); + btrfs_node_key_to_cpu(*eb_ret, &first_key, slot); tmp = find_extent_buffer(fs_info, blocknr); if (tmp) { @@ -2501,19 +2487,6 @@ done: return ret; } -static int key_search(struct extent_buffer *b, const struct btrfs_key *key, - int level, int *prev_cmp, int *slot) -{ - if (*prev_cmp != 0) { - *prev_cmp = btrfs_bin_search(b, key, level, slot); - return *prev_cmp; - } - - *slot = 0; - - return 0; -} - int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, u64 iobjectid, u64 ioff, u8 key_type, struct btrfs_key *found_key) @@ -2783,9 +2756,23 @@ cow_done: } } - ret = key_search(b, key, level, &prev_cmp, &slot); - if (ret < 0) - goto done; + /* + * If btrfs_bin_search returns an exact match (prev_cmp == 0) + * we can safely assume the target key will always be in slot 0 + * on lower levels due to the invariants BTRFS' btree provides, + * namely that a btrfs_key_ptr entry always points to the + * lowest key in the child node, thus we can skip searching + * lower levels + */ + if (prev_cmp == 0) { + slot = 0; + ret = 0; + } else { + ret = btrfs_bin_search(b, key, &slot); + prev_cmp = ret; + if (ret < 0) + goto done; + } if (level == 0) { p->slots[level] = slot; @@ -2909,7 +2896,6 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, int level; int lowest_unlock = 1; u8 lowest_level = 0; - int prev_cmp = -1; lowest_level = p->lowest_level; WARN_ON(p->nodes[0] != NULL); @@ -2942,12 +2928,7 @@ again: */ btrfs_unlock_up_safe(p, level + 1); - /* - * Since we can unwind ebs we want to do a real search every - * time. - */ - prev_cmp = -1; - ret = key_search(b, key, level, &prev_cmp, &slot); + ret = btrfs_bin_search(b, key, &slot); if (ret < 0) goto done; @@ -3507,19 +3488,17 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr) { struct btrfs_item *start_item; struct btrfs_item *end_item; - struct btrfs_map_token token; int data_len; int nritems = btrfs_header_nritems(l); int end = min(nritems, start + nr) - 1; if (!nr) return 0; - btrfs_init_map_token(&token, l); start_item = btrfs_item_nr(start); end_item = btrfs_item_nr(end); - data_len = btrfs_token_item_offset(l, start_item, &token) + - btrfs_token_item_size(l, start_item, &token); - data_len = data_len - btrfs_token_item_offset(l, end_item, &token); + data_len = btrfs_item_offset(l, start_item) + + btrfs_item_size(l, start_item); + data_len = data_len - btrfs_item_offset(l, end_item); data_len += sizeof(struct btrfs_item) * nr; WARN_ON(data_len < 0); return data_len; @@ -3650,8 +3629,8 @@ static noinline int __push_leaf_right(struct btrfs_path *path, push_space = BTRFS_LEAF_DATA_SIZE(fs_info); for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(i); - push_space -= btrfs_token_item_size(right, item, &token); - btrfs_set_token_item_offset(right, item, push_space, &token); + push_space -= btrfs_token_item_size(&token, item); + btrfs_set_token_item_offset(&token, item, push_space); } left_nritems -= push_items; @@ -3859,10 +3838,9 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, item = btrfs_item_nr(i); - ioff = btrfs_token_item_offset(left, item, &token); - btrfs_set_token_item_offset(left, item, - ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size), - &token); + ioff = btrfs_token_item_offset(&token, item); + btrfs_set_token_item_offset(&token, item, + ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size)); } btrfs_set_header_nritems(left, old_left_nritems + push_items); @@ -3892,9 +3870,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(i); - push_space = push_space - btrfs_token_item_size(right, - item, &token); - btrfs_set_token_item_offset(right, item, push_space, &token); + push_space = push_space - btrfs_token_item_size(&token, item); + btrfs_set_token_item_offset(&token, item, push_space); } btrfs_mark_buffer_dirty(left); @@ -4036,9 +4013,8 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, struct btrfs_item *item = btrfs_item_nr(i); u32 ioff; - ioff = btrfs_token_item_offset(right, item, &token); - btrfs_set_token_item_offset(right, item, - ioff + rt_data_off, &token); + ioff = btrfs_token_item_offset(&token, item); + btrfs_set_token_item_offset(&token, item, ioff + rt_data_off); } btrfs_set_header_nritems(l, mid); @@ -4541,9 +4517,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) u32 ioff; item = btrfs_item_nr(i); - ioff = btrfs_token_item_offset(leaf, item, &token); - btrfs_set_token_item_offset(leaf, item, - ioff + size_diff, &token); + ioff = btrfs_token_item_offset(&token, item); + btrfs_set_token_item_offset(&token, item, ioff + size_diff); } /* shift the data */ @@ -4640,9 +4615,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) u32 ioff; item = btrfs_item_nr(i); - ioff = btrfs_token_item_offset(leaf, item, &token); - btrfs_set_token_item_offset(leaf, item, - ioff - data_size, &token); + ioff = btrfs_token_item_offset(&token, item); + btrfs_set_token_item_offset(&token, item, ioff - data_size); } /* shift the data */ @@ -4718,9 +4692,9 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, u32 ioff; item = btrfs_item_nr(i); - ioff = btrfs_token_item_offset(leaf, item, &token); - btrfs_set_token_item_offset(leaf, item, - ioff - total_data, &token); + ioff = btrfs_token_item_offset(&token, item); + btrfs_set_token_item_offset(&token, item, + ioff - total_data); } /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), @@ -4739,10 +4713,9 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(slot + i); - btrfs_set_token_item_offset(leaf, item, - data_end - data_size[i], &token); + btrfs_set_token_item_offset(&token, item, data_end - data_size[i]); data_end -= data_size[i]; - btrfs_set_token_item_size(leaf, item, data_size[i], &token); + btrfs_set_token_item_size(&token, item, data_size[i]); } btrfs_set_header_nritems(leaf, nritems + nr); @@ -4930,9 +4903,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 ioff; item = btrfs_item_nr(i); - ioff = btrfs_token_item_offset(leaf, item, &token); - btrfs_set_token_item_offset(leaf, item, - ioff + dsize, &token); + ioff = btrfs_token_item_offset(&token, item); + btrfs_set_token_item_offset(&token, item, ioff + dsize); } memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), @@ -5103,7 +5075,7 @@ again: while (1) { nritems = btrfs_header_nritems(cur); level = btrfs_header_level(cur); - sret = btrfs_bin_search(cur, min_key, level, &slot); + sret = btrfs_bin_search(cur, min_key, &slot); if (sret < 0) { ret = sret; goto out; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8aa7b9dac405..161533040978 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -28,6 +28,7 @@ #include <linux/dynamic_debug.h> #include <linux/refcount.h> #include <linux/crc32c.h> +#include <linux/iomap.h> #include "extent-io-tree.h" #include "extent_io.h" #include "extent_map.h" @@ -582,6 +583,7 @@ struct btrfs_fs_info { struct btrfs_root *quota_root; struct btrfs_root *uuid_root; struct btrfs_root *free_space_root; + struct btrfs_root *data_reloc_root; /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; @@ -758,7 +760,6 @@ struct btrfs_fs_info { struct btrfs_workqueue *endio_workers; struct btrfs_workqueue *endio_meta_workers; struct btrfs_workqueue *endio_raid56_workers; - struct btrfs_workqueue *endio_repair_workers; struct btrfs_workqueue *rmw_workers; struct btrfs_workqueue *endio_meta_write_workers; struct btrfs_workqueue *endio_write_workers; @@ -970,7 +971,28 @@ enum { * is used to tell us when more checks are required */ BTRFS_ROOT_IN_TRANS_SETUP, - BTRFS_ROOT_REF_COWS, + + /* + * Set if tree blocks of this root can be shared by other roots. + * Only subvolume trees and their reloc trees have this bit set. + * Conflicts with TRACK_DIRTY bit. + * + * This affects two things: + * + * - How balance works + * For shareable roots, we need to use reloc tree and do path + * replacement for balance, and need various pre/post hooks for + * snapshot creation to handle them. + * + * While for non-shareable trees, we just simply do a tree search + * with COW. + * + * - How dirty roots are tracked + * For shareable roots, btrfs_record_root_in_trans() is needed to + * track them, while non-subvolume roots have TRACK_DIRTY bit, they + * don't need to set this manually. + */ + BTRFS_ROOT_SHAREABLE, BTRFS_ROOT_TRACK_DIRTY, BTRFS_ROOT_IN_RADIX, BTRFS_ROOT_ORPHAN_ITEM_INSERTED, @@ -1056,7 +1078,7 @@ struct btrfs_root { struct btrfs_key defrag_progress; struct btrfs_key defrag_max; - /* the dirty list is only used by non-reference counted roots */ + /* The dirty list is only used by non-shareable roots */ struct list_head dirty_list; struct list_head root_list; @@ -1146,6 +1168,9 @@ struct btrfs_root { /* Record pairs of swapped blocks for qgroup */ struct btrfs_qgroup_swapped_blocks swapped_blocks; + /* Used only by log trees, when logging csum items */ + struct extent_io_tree log_csum_range; + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS u64 alloc_bytenr; #endif @@ -1341,7 +1366,7 @@ do { \ BTRFS_INODE_ROOT_ITEM_INIT) struct btrfs_map_token { - const struct extent_buffer *eb; + struct extent_buffer *eb; char *kaddr; unsigned long offset; }; @@ -1353,7 +1378,8 @@ static inline void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb) { token->eb = eb; - token->kaddr = NULL; + token->kaddr = page_address(eb->pages[0]); + token->offset = 0; } /* some macros to generate set/get functions for the struct fields. This @@ -1377,15 +1403,14 @@ static inline void btrfs_init_map_token(struct btrfs_map_token *token, sizeof(((type *)0)->member))) #define DECLARE_BTRFS_SETGET_BITS(bits) \ -u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \ - const void *ptr, unsigned long off, \ - struct btrfs_map_token *token); \ -void btrfs_set_token_##bits(struct extent_buffer *eb, const void *ptr, \ - unsigned long off, u##bits val, \ - struct btrfs_map_token *token); \ +u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ + const void *ptr, unsigned long off); \ +void btrfs_set_token_##bits(struct btrfs_map_token *token, \ + const void *ptr, unsigned long off, \ + u##bits val); \ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ const void *ptr, unsigned long off); \ -void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \ +void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ unsigned long off, u##bits val); DECLARE_BTRFS_SETGET_BITS(8) @@ -1400,25 +1425,23 @@ static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ return btrfs_get_##bits(eb, s, offsetof(type, member)); \ } \ -static inline void btrfs_set_##name(struct extent_buffer *eb, type *s, \ +static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ u##bits val) \ { \ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ btrfs_set_##bits(eb, s, offsetof(type, member), val); \ } \ -static inline u##bits btrfs_token_##name(const struct extent_buffer *eb,\ - const type *s, \ - struct btrfs_map_token *token) \ +static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ + const type *s) \ { \ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ - return btrfs_get_token_##bits(eb, s, offsetof(type, member), token); \ + return btrfs_get_token_##bits(token, s, offsetof(type, member));\ } \ -static inline void btrfs_set_token_##name(struct extent_buffer *eb, \ - type *s, u##bits val, \ - struct btrfs_map_token *token) \ +static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ + type *s, u##bits val) \ { \ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ - btrfs_set_token_##bits(eb, s, offsetof(type, member), val, token); \ + btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ } #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ @@ -1428,7 +1451,7 @@ static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ u##bits res = le##bits##_to_cpu(p->member); \ return res; \ } \ -static inline void btrfs_set_##name(struct extent_buffer *eb, \ +static inline void btrfs_set_##name(const struct extent_buffer *eb, \ u##bits val) \ { \ type *p = page_address(eb->pages[0]); \ @@ -1446,7 +1469,7 @@ static inline void btrfs_set_##name(type *s, u##bits val) \ } -static inline u64 btrfs_device_total_bytes(struct extent_buffer *eb, +static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s) { BUILD_BUG_ON(sizeof(u64) != @@ -1454,7 +1477,7 @@ static inline u64 btrfs_device_total_bytes(struct extent_buffer *eb, return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes)); } -static inline void btrfs_set_device_total_bytes(struct extent_buffer *eb, +static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s, u64 val) { @@ -1558,13 +1581,13 @@ static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); } -static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb, +static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb, struct btrfs_chunk *c, int nr) { return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); } -static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb, +static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb, struct btrfs_chunk *c, int nr) { return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); @@ -1644,31 +1667,21 @@ BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, chunk_offset, 64); BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); - -static inline unsigned long btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) -{ - unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid); - return (unsigned long)dev + ptr; -} - BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, generation, 64); BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); -BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32); - - BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); -static inline void btrfs_tree_block_key(struct extent_buffer *eb, +static inline void btrfs_tree_block_key(const struct extent_buffer *eb, struct btrfs_tree_block_info *item, struct btrfs_disk_key *key) { read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); } -static inline void btrfs_set_tree_block_key(struct extent_buffer *eb, +static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb, struct btrfs_tree_block_info *item, struct btrfs_disk_key *key) { @@ -1706,12 +1719,6 @@ static inline u32 btrfs_extent_inline_ref_size(int type) return 0; } -BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64); -BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0, - generation, 64); -BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64); -BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32); - /* struct btrfs_node */ BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); @@ -1720,7 +1727,7 @@ BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr, generation, 64); -static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr) +static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr) { unsigned long ptr; ptr = offsetof(struct btrfs_node, ptrs) + @@ -1728,7 +1735,7 @@ static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr) return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); } -static inline void btrfs_set_node_blockptr(struct extent_buffer *eb, +static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb, int nr, u64 val) { unsigned long ptr; @@ -1737,7 +1744,7 @@ static inline void btrfs_set_node_blockptr(struct extent_buffer *eb, btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); } -static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr) +static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr) { unsigned long ptr; ptr = offsetof(struct btrfs_node, ptrs) + @@ -1745,7 +1752,7 @@ static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr) return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); } -static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb, +static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb, int nr, u64 val) { unsigned long ptr; @@ -1763,7 +1770,7 @@ static inline unsigned long btrfs_node_key_ptr_offset(int nr) void btrfs_node_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr); -static inline void btrfs_set_node_key(struct extent_buffer *eb, +static inline void btrfs_set_node_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { unsigned long ptr; @@ -2498,8 +2505,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr); -void btrfs_get_block_group_trimming(struct btrfs_block_group *cache); -void btrfs_put_block_group_trimming(struct btrfs_block_group *cache); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); enum btrfs_reserve_flush_enum { @@ -2512,6 +2517,7 @@ enum btrfs_reserve_flush_enum { BTRFS_RESERVE_FLUSH_LIMIT, BTRFS_RESERVE_FLUSH_EVICT, BTRFS_RESERVE_FLUSH_ALL, + BTRFS_RESERVE_FLUSH_ALL_STEAL, }; enum btrfs_flush_state { @@ -2551,7 +2557,7 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, - int level, int *slot); + int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, @@ -2896,10 +2902,9 @@ void btrfs_free_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); int __init btrfs_init_cachep(void); void __cold btrfs_destroy_cachep(void); -struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location, +struct inode *btrfs_iget_path(struct super_block *s, u64 ino, struct btrfs_root *root, struct btrfs_path *path); -struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root); +struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 end); @@ -2929,6 +2934,9 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, u64 end, int uptodate); extern const struct dentry_operations btrfs_dentry_operations; +ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter); +extern const struct iomap_ops btrfs_dio_iomap_ops; +extern const struct iomap_dio_ops btrfs_dops; /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -3381,6 +3389,9 @@ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending); int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info); +struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, + u64 bytenr); +int btrfs_should_ignore_reloc_root(struct btrfs_root *root); /* scrub.c */ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7278789ff8a7..7c6f0bbb54a5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -358,16 +358,14 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); shash->tfm = fs_info->csum_shash; - crypto_shash_init(shash); /* * The super_block structure does not span the whole * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is * filled with zeros and is included in the checksum. */ - crypto_shash_update(shash, raw_disk_sb + BTRFS_CSUM_SIZE, - BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); - crypto_shash_final(shash, result); + crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE, + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result); if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb))) return 1; @@ -709,9 +707,7 @@ static void end_workqueue_bio(struct bio *bio) else wq = fs_info->endio_write_workers; } else { - if (unlikely(end_io_wq->metadata == BTRFS_WQ_ENDIO_DIO_REPAIR)) - wq = fs_info->endio_repair_workers; - else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) wq = fs_info->endio_raid56_workers; else if (end_io_wq->metadata) wq = fs_info->endio_meta_workers; @@ -1135,9 +1131,12 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; - if (!dummy) + if (!dummy) { extent_io_tree_init(fs_info, &root->dirty_log_pages, IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL); + extent_io_tree_init(fs_info, &root->log_csum_range, + IO_TREE_LOG_CSUM_RANGE, NULL); + } memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); @@ -1275,12 +1274,13 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; /* - * DON'T set REF_COWS for log trees + * DON'T set SHAREABLE bit for log trees. * - * log trees do not get reference counted because they go away - * before a real commit is actually done. They do store pointers - * to file data extents, and those reference counts still get - * updated (along with back refs to the log tree). + * Log trees are not exposed to user space thus can't be snapshotted, + * and they go away before a real commit is actually done. + * + * They do store pointers to file data extents, and those reference + * counts still get updated (along with back refs to the log tree). */ leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, @@ -1418,8 +1418,9 @@ static int btrfs_init_fs_root(struct btrfs_root *root) if (ret) goto fail; - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - set_bit(BTRFS_ROOT_REF_COWS, &root->state); + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && + root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { + set_bit(BTRFS_ROOT_SHAREABLE, &root->state); btrfs_check_and_init_root_item(&root->root_item); } @@ -1524,6 +1525,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_put_root(fs_info->uuid_root); btrfs_put_root(fs_info->free_space_root); btrfs_put_root(fs_info->fs_root); + btrfs_put_root(fs_info->data_reloc_root); btrfs_check_leaked_roots(fs_info); btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); @@ -1533,35 +1535,34 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, - struct btrfs_key *location, - bool check_ref) + u64 objectid, bool check_ref) { struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; int ret; - if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) + if (objectid == BTRFS_ROOT_TREE_OBJECTID) return btrfs_grab_root(fs_info->tree_root); - if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) + if (objectid == BTRFS_EXTENT_TREE_OBJECTID) return btrfs_grab_root(fs_info->extent_root); - if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) + if (objectid == BTRFS_CHUNK_TREE_OBJECTID) return btrfs_grab_root(fs_info->chunk_root); - if (location->objectid == BTRFS_DEV_TREE_OBJECTID) + if (objectid == BTRFS_DEV_TREE_OBJECTID) return btrfs_grab_root(fs_info->dev_root); - if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) + if (objectid == BTRFS_CSUM_TREE_OBJECTID) return btrfs_grab_root(fs_info->csum_root); - if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID) + if (objectid == BTRFS_QUOTA_TREE_OBJECTID) return btrfs_grab_root(fs_info->quota_root) ? fs_info->quota_root : ERR_PTR(-ENOENT); - if (location->objectid == BTRFS_UUID_TREE_OBJECTID) + if (objectid == BTRFS_UUID_TREE_OBJECTID) return btrfs_grab_root(fs_info->uuid_root) ? fs_info->uuid_root : ERR_PTR(-ENOENT); - if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) + if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) return btrfs_grab_root(fs_info->free_space_root) ? fs_info->free_space_root : ERR_PTR(-ENOENT); again: - root = btrfs_lookup_fs_root(fs_info, location->objectid); + root = btrfs_lookup_fs_root(fs_info, objectid); if (root) { if (check_ref && btrfs_root_refs(&root->root_item) == 0) { btrfs_put_root(root); @@ -1570,7 +1571,10 @@ again: return root; } - root = btrfs_read_tree_root(fs_info->tree_root, location); + key.objectid = objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + root = btrfs_read_tree_root(fs_info->tree_root, &key); if (IS_ERR(root)) return root; @@ -1590,7 +1594,7 @@ again: } key.objectid = BTRFS_ORPHAN_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; - key.offset = location->objectid; + key.offset = objectid; ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); btrfs_free_path(path); @@ -1940,7 +1944,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->workers); btrfs_destroy_workqueue(fs_info->endio_workers); btrfs_destroy_workqueue(fs_info->endio_raid56_workers); - btrfs_destroy_workqueue(fs_info->endio_repair_workers); btrfs_destroy_workqueue(fs_info->rmw_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); btrfs_destroy_workqueue(fs_info->endio_freespace_worker); @@ -1981,6 +1984,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) free_root_extent_buffers(info->quota_root); free_root_extent_buffers(info->uuid_root); free_root_extent_buffers(info->fs_root); + free_root_extent_buffers(info->data_reloc_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); free_root_extent_buffers(info->free_space_root); @@ -1993,6 +1997,7 @@ void btrfs_put_root(struct btrfs_root *root) if (refcount_dec_and_test(&root->refs)) { WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); + WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)); if (root->anon_dev) free_anon_bdev(root->anon_dev); btrfs_drew_lock_destroy(&root->snapshot_lock); @@ -2143,8 +2148,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, fs_info->endio_raid56_workers = btrfs_alloc_workqueue(fs_info, "endio-raid56", flags, max_active, 4); - fs_info->endio_repair_workers = - btrfs_alloc_workqueue(fs_info, "endio-repair", flags, 1, 0); fs_info->rmw_workers = btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2); fs_info->endio_write_workers = @@ -2168,7 +2171,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->endio_meta_write_workers && - fs_info->endio_repair_workers && fs_info->endio_write_workers && fs_info->endio_raid56_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->readahead_workers && @@ -2290,6 +2292,19 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->csum_root = root; + /* + * This tree can share blocks with some other fs tree during relocation + * and we need a proper setup by btrfs_get_fs_root + */ + root = btrfs_get_fs_root(tree_root->fs_info, + BTRFS_DATA_RELOC_TREE_OBJECTID, true); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out; + } + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->data_reloc_root = root; + location.objectid = BTRFS_QUOTA_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (!IS_ERR(root)) { @@ -2827,7 +2842,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device u64 generation; u64 features; u16 csum_type; - struct btrfs_key location; struct btrfs_super_block *disk_super; struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *tree_root; @@ -3241,11 +3255,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } } - location.objectid = BTRFS_FS_TREE_OBJECTID; - location.type = BTRFS_ROOT_ITEM_KEY; - location.offset = 0; - - fs_info->fs_root = btrfs_get_fs_root(fs_info, &location, true); + fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true); if (IS_ERR(fs_info->fs_root)) { err = PTR_ERR(fs_info->fs_root); btrfs_warn(fs_info, "failed to read fs tree: %d", err); @@ -3508,10 +3518,9 @@ static int write_dev_supers(struct btrfs_device *device, btrfs_set_super_bytenr(sb, bytenr); - crypto_shash_init(shash); - crypto_shash_update(shash, (const char *)sb + BTRFS_CSUM_SIZE, - BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); - crypto_shash_final(shash, sb->csum); + crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE, + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, + sb->csum); page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index cd629113f61c..bf43245406c4 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -25,7 +25,6 @@ enum btrfs_wq_endio_type { BTRFS_WQ_ENDIO_METADATA, BTRFS_WQ_ENDIO_FREE_SPACE, BTRFS_WQ_ENDIO_RAID56, - BTRFS_WQ_ENDIO_DIO_REPAIR, }; static inline u64 btrfs_sb_offset(int mirror) @@ -67,8 +66,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, - struct btrfs_key *key, - bool check_ref); + u64 objectid, bool check_ref); void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 2bb25d2dc44b..1a8d419d9e1f 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -64,24 +64,15 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root; struct inode *inode; - struct btrfs_key key; if (objectid < BTRFS_FIRST_FREE_OBJECTID) return ERR_PTR(-ESTALE); - key.objectid = root_objectid; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - root = btrfs_get_fs_root(fs_info, &key, true); + root = btrfs_get_fs_root(fs_info, root_objectid, true); if (IS_ERR(root)) return ERR_CAST(root); - key.objectid = objectid; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - - inode = btrfs_iget(sb, &key, root); + inode = btrfs_iget(sb, objectid, root); btrfs_put_root(root); if (IS_ERR(inode)) return ERR_CAST(inode); @@ -200,9 +191,7 @@ struct dentry *btrfs_get_parent(struct dentry *child) found_key.offset, 0, 0); } - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - return d_obtain_alias(btrfs_iget(fs_info->sb, &key, root)); + return d_obtain_alias(btrfs_iget(fs_info->sb, key.objectid, root)); fail: btrfs_free_path(path); return ERR_PTR(ret); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index b4a7bad3e82e..b6561455b3c4 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -44,6 +44,7 @@ enum { IO_TREE_TRANS_DIRTY_PAGES, IO_TREE_ROOT_DIRTY_LOG_PAGES, IO_TREE_INODE_FILE_EXTENT, + IO_TREE_LOG_CSUM_RANGE, IO_TREE_SELFTEST, }; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 54a64d1e18c6..c0bc35f932bf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2114,22 +2114,6 @@ static u64 find_middle(struct rb_root *root) } #endif -static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads) -{ - u64 num_bytes; - - num_bytes = heads * (sizeof(struct btrfs_extent_item) + - sizeof(struct btrfs_extent_inline_ref)); - if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) - num_bytes += heads * sizeof(struct btrfs_tree_block_info); - - /* - * We don't ever fill up leaves all the way so multiply by 2 just to be - * closer to what we're really going to want to use. - */ - return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info)); -} - /* * Takes the number of bytes to be csumm'ed and figures out how many leaves it * would require to store the csums for that many bytes. @@ -2442,7 +2426,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); - if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0) + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0) return 0; if (full_backref) @@ -2932,7 +2916,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) &trimmed); list_del_init(&block_group->bg_list); - btrfs_put_block_group_trimming(block_group); + btrfs_unfreeze_block_group(block_group); btrfs_put_block_group(block_group); if (ret) { @@ -3369,6 +3353,7 @@ static struct btrfs_block_group *btrfs_lock_cluster( struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, int delalloc) + __acquires(&cluster->refill_lock) { struct btrfs_block_group *used_bg = NULL; @@ -5501,8 +5486,6 @@ out: */ if (!for_reloc && !root_dropped) btrfs_add_dead_root(root); - if (err && err != -EAGAIN) - btrfs_handle_fs_error(fs_info, err, NULL); return err; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e12eb32d9e17..68c96057ad2d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2333,7 +2333,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, return 0; } -int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num) +int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { struct btrfs_fs_info *fs_info = eb->fs_info; u64 start = eb->start; @@ -2537,8 +2537,9 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, return 0; } -bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages, - struct io_failure_record *failrec, int failed_mirror) +static bool btrfs_check_repairable(struct inode *inode, bool needs_validation, + struct io_failure_record *failrec, + int failed_mirror) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int num_copies; @@ -2561,7 +2562,7 @@ bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages, * a) deliver good data to the caller * b) correct the bad sectors on disk */ - if (failed_bio_pages > 1) { + if (needs_validation) { /* * to fulfill b), we need to know the exact failing sectors, as * we don't want to rewrite any more than the failed ones. thus, @@ -2600,94 +2601,115 @@ bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages, return true; } - -struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, - struct io_failure_record *failrec, - struct page *page, int pg_offset, int icsum, - bio_end_io_t *endio_func, void *data) +static bool btrfs_io_needs_validation(struct inode *inode, struct bio *bio) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct bio *bio; - struct btrfs_io_bio *btrfs_failed_bio; - struct btrfs_io_bio *btrfs_bio; + u64 len = 0; + const u32 blocksize = inode->i_sb->s_blocksize; - bio = btrfs_io_bio_alloc(1); - bio->bi_end_io = endio_func; - bio->bi_iter.bi_sector = failrec->logical >> 9; - bio->bi_iter.bi_size = 0; - bio->bi_private = data; + /* + * If bi_status is BLK_STS_OK, then this was a checksum error, not an + * I/O error. In this case, we already know exactly which sector was + * bad, so we don't need to validate. + */ + if (bio->bi_status == BLK_STS_OK) + return false; - btrfs_failed_bio = btrfs_io_bio(failed_bio); - if (btrfs_failed_bio->csum) { - u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + /* + * We need to validate each sector individually if the failed I/O was + * for multiple sectors. + * + * There are a few possible bios that can end up here: + * 1. A buffered read bio, which is not cloned. + * 2. A direct I/O read bio, which is cloned. + * 3. A (buffered or direct) repair bio, which is not cloned. + * + * For cloned bios (case 2), we can get the size from + * btrfs_io_bio->iter; for non-cloned bios (cases 1 and 3), we can get + * it from the bvecs. + */ + if (bio_flagged(bio, BIO_CLONED)) { + if (btrfs_io_bio(bio)->iter.bi_size > blocksize) + return true; + } else { + struct bio_vec *bvec; + int i; - btrfs_bio = btrfs_io_bio(bio); - btrfs_bio->csum = btrfs_bio->csum_inline; - icsum *= csum_size; - memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum, - csum_size); + bio_for_each_bvec_all(bvec, bio, i) { + len += bvec->bv_len; + if (len > blocksize) + return true; + } } - - bio_add_page(bio, page, failrec->len, pg_offset); - - return bio; + return false; } -/* - * This is a generic handler for readpage errors. If other copies exist, read - * those and write back good data to the failed position. Does not investigate - * in remapping the failed extent elsewhere, hoping the device will be smart - * enough to do this as needed - */ -static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, - struct page *page, u64 start, u64 end, - int failed_mirror) +blk_status_t btrfs_submit_read_repair(struct inode *inode, + struct bio *failed_bio, u64 phy_offset, + struct page *page, unsigned int pgoff, + u64 start, u64 end, int failed_mirror, + submit_bio_hook_t *submit_bio_hook) { struct io_failure_record *failrec; - struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct bio *bio; - int read_mode = 0; + struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio); + const int icsum = phy_offset >> inode->i_sb->s_blocksize_bits; + bool need_validation; + struct bio *repair_bio; + struct btrfs_io_bio *repair_io_bio; blk_status_t status; int ret; - unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT; + + btrfs_debug(fs_info, + "repair read error: read error at %llu", start); BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); ret = btrfs_get_io_failure_record(inode, start, end, &failrec); if (ret) - return ret; + return errno_to_blk_status(ret); + + need_validation = btrfs_io_needs_validation(inode, failed_bio); - if (!btrfs_check_repairable(inode, failed_bio_pages, failrec, + if (!btrfs_check_repairable(inode, need_validation, failrec, failed_mirror)) { free_io_failure(failure_tree, tree, failrec); - return -EIO; + return BLK_STS_IOERR; } - if (failed_bio_pages > 1) - read_mode |= REQ_FAILFAST_DEV; + repair_bio = btrfs_io_bio_alloc(1); + repair_io_bio = btrfs_io_bio(repair_bio); + repair_bio->bi_opf = REQ_OP_READ; + if (need_validation) + repair_bio->bi_opf |= REQ_FAILFAST_DEV; + repair_bio->bi_end_io = failed_bio->bi_end_io; + repair_bio->bi_iter.bi_sector = failrec->logical >> 9; + repair_bio->bi_private = failed_bio->bi_private; - phy_offset >>= inode->i_sb->s_blocksize_bits; - bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, - start - page_offset(page), - (int)phy_offset, failed_bio->bi_end_io, - NULL); - bio->bi_opf = REQ_OP_READ | read_mode; + if (failed_io_bio->csum) { + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + + repair_io_bio->csum = repair_io_bio->csum_inline; + memcpy(repair_io_bio->csum, + failed_io_bio->csum + csum_size * icsum, csum_size); + } + + bio_add_page(repair_bio, page, failrec->len, pgoff); + repair_io_bio->logical = failrec->start; + repair_io_bio->iter = repair_bio->bi_iter; btrfs_debug(btrfs_sb(inode->i_sb), - "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d", - read_mode, failrec->this_mirror, failrec->in_validation); +"repair read error: submitting new read to mirror %d, in_validation=%d", + failrec->this_mirror, failrec->in_validation); - status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror, - failrec->bio_flags); + status = submit_bio_hook(inode, repair_bio, failrec->this_mirror, + failrec->bio_flags); if (status) { free_io_failure(failure_tree, tree, failrec); - bio_put(bio); - ret = blk_status_to_errno(status); + bio_put(repair_bio); } - - return ret; + return status; } /* lots and lots of room for performance fixes in the end_bio funcs */ @@ -2859,9 +2881,10 @@ static void end_bio_extent_readpage(struct bio *bio) * If it can't handle the error it will return -EIO and * we remain responsible for that page. */ - ret = bio_readpage_error(bio, offset, page, start, end, - mirror); - if (ret == 0) { + if (!btrfs_submit_read_repair(inode, bio, offset, page, + start - page_offset(page), + start, end, mirror, + tree->ops->submit_bio_hook)) { uptodate = !bio->bi_status; offset += len; continue; @@ -4862,7 +4885,7 @@ static void __free_extent_buffer(struct extent_buffer *eb) kmem_cache_free(extent_buffer_cache, eb); } -int extent_buffer_under_io(struct extent_buffer *eb) +int extent_buffer_under_io(const struct extent_buffer *eb) { return (atomic_read(&eb->io_pages) || test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || @@ -4967,7 +4990,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, return eb; } -struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) +struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) { int i; struct page *p; @@ -5373,7 +5396,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb) release_extent_buffer(eb); } -void clear_extent_buffer_dirty(struct extent_buffer *eb) +void clear_extent_buffer_dirty(const struct extent_buffer *eb) { int i; int num_pages; @@ -5571,8 +5594,7 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, struct page *page; char *kaddr; char *dst = (char *)dstv; - size_t start_offset = offset_in_page(eb->start); - unsigned long i = (start_offset + start) >> PAGE_SHIFT; + unsigned long i = start >> PAGE_SHIFT; if (start + len > eb->len) { WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", @@ -5581,7 +5603,7 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, return; } - offset = offset_in_page(start_offset + start); + offset = offset_in_page(start); while (len > 0) { page = eb->pages[i]; @@ -5606,14 +5628,13 @@ int read_extent_buffer_to_user(const struct extent_buffer *eb, struct page *page; char *kaddr; char __user *dst = (char __user *)dstv; - size_t start_offset = offset_in_page(eb->start); - unsigned long i = (start_offset + start) >> PAGE_SHIFT; + unsigned long i = start >> PAGE_SHIFT; int ret = 0; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = offset_in_page(start_offset + start); + offset = offset_in_page(start); while (len > 0) { page = eb->pages[i]; @@ -5634,48 +5655,6 @@ int read_extent_buffer_to_user(const struct extent_buffer *eb, return ret; } -/* - * return 0 if the item is found within a page. - * return 1 if the item spans two pages. - * return -EINVAL otherwise. - */ -int map_private_extent_buffer(const struct extent_buffer *eb, - unsigned long start, unsigned long min_len, - char **map, unsigned long *map_start, - unsigned long *map_len) -{ - size_t offset; - char *kaddr; - struct page *p; - size_t start_offset = offset_in_page(eb->start); - unsigned long i = (start_offset + start) >> PAGE_SHIFT; - unsigned long end_i = (start_offset + start + min_len - 1) >> - PAGE_SHIFT; - - if (start + min_len > eb->len) { - WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", - eb->start, eb->len, start, min_len); - return -EINVAL; - } - - if (i != end_i) - return 1; - - if (i == 0) { - offset = start_offset; - *map_start = 0; - } else { - offset = 0; - *map_start = ((u64)i << PAGE_SHIFT) - start_offset; - } - - p = eb->pages[i]; - kaddr = page_address(p); - *map = kaddr + offset; - *map_len = PAGE_SIZE - offset; - return 0; -} - int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, unsigned long start, unsigned long len) { @@ -5684,14 +5663,13 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, struct page *page; char *kaddr; char *ptr = (char *)ptrv; - size_t start_offset = offset_in_page(eb->start); - unsigned long i = (start_offset + start) >> PAGE_SHIFT; + unsigned long i = start >> PAGE_SHIFT; int ret = 0; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = offset_in_page(start_offset + start); + offset = offset_in_page(start); while (len > 0) { page = eb->pages[i]; @@ -5711,7 +5689,7 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, return ret; } -void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb, +void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, const void *srcv) { char *kaddr; @@ -5722,7 +5700,7 @@ void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb, BTRFS_FSID_SIZE); } -void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv) +void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv) { char *kaddr; @@ -5732,7 +5710,7 @@ void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv) BTRFS_FSID_SIZE); } -void write_extent_buffer(struct extent_buffer *eb, const void *srcv, +void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, unsigned long start, unsigned long len) { size_t cur; @@ -5740,13 +5718,12 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, struct page *page; char *kaddr; char *src = (char *)srcv; - size_t start_offset = offset_in_page(eb->start); - unsigned long i = (start_offset + start) >> PAGE_SHIFT; + unsigned long i = start >> PAGE_SHIFT; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = offset_in_page(start_offset + start); + offset = offset_in_page(start); while (len > 0) { page = eb->pages[i]; @@ -5763,20 +5740,19 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, } } -void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start, +void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, unsigned long len) { size_t cur; size_t offset; struct page *page; char *kaddr; - size_t start_offset = offset_in_page(eb->start); - unsigned long i = (start_offset + start) >> PAGE_SHIFT; + unsigned long i = start >> PAGE_SHIFT; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = offset_in_page(start_offset + start); + offset = offset_in_page(start); while (len > 0) { page = eb->pages[i]; @@ -5792,8 +5768,8 @@ void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start, } } -void copy_extent_buffer_full(struct extent_buffer *dst, - struct extent_buffer *src) +void copy_extent_buffer_full(const struct extent_buffer *dst, + const struct extent_buffer *src) { int i; int num_pages; @@ -5806,7 +5782,8 @@ void copy_extent_buffer_full(struct extent_buffer *dst, page_address(src->pages[i])); } -void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, +void copy_extent_buffer(const struct extent_buffer *dst, + const struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { @@ -5815,12 +5792,11 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, size_t offset; struct page *page; char *kaddr; - size_t start_offset = offset_in_page(dst->start); - unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT; + unsigned long i = dst_offset >> PAGE_SHIFT; WARN_ON(src->len != dst_len); - offset = offset_in_page(start_offset + dst_offset); + offset = offset_in_page(dst_offset); while (len > 0) { page = dst->pages[i]; @@ -5851,12 +5827,11 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, * This helper hides the ugliness of finding the byte in an extent buffer which * contains a given bit. */ -static inline void eb_bitmap_offset(struct extent_buffer *eb, +static inline void eb_bitmap_offset(const struct extent_buffer *eb, unsigned long start, unsigned long nr, unsigned long *page_index, size_t *page_offset) { - size_t start_offset = offset_in_page(eb->start); size_t byte_offset = BIT_BYTE(nr); size_t offset; @@ -5865,7 +5840,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb, * the bitmap item in the extent buffer + the offset of the byte in the * bitmap item. */ - offset = start_offset + start + byte_offset; + offset = start + byte_offset; *page_index = offset >> PAGE_SHIFT; *page_offset = offset_in_page(offset); @@ -5877,7 +5852,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb, * @start: offset of the bitmap item in the extent buffer * @nr: bit number to test */ -int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, +int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, unsigned long nr) { u8 *kaddr; @@ -5899,7 +5874,7 @@ int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, * @pos: bit number of the first bit * @len: number of bits to set */ -void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, +void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) { u8 *kaddr; @@ -5941,8 +5916,9 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, * @pos: bit number of the first bit * @len: number of bits to clear */ -void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, - unsigned long pos, unsigned long len) +void extent_buffer_bitmap_clear(const struct extent_buffer *eb, + unsigned long start, unsigned long pos, + unsigned long len) { u8 *kaddr; struct page *page; @@ -6003,14 +5979,14 @@ static void copy_pages(struct page *dst_page, struct page *src_page, memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); } -void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, - unsigned long src_offset, unsigned long len) +void memcpy_extent_buffer(const struct extent_buffer *dst, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len) { struct btrfs_fs_info *fs_info = dst->fs_info; size_t cur; size_t dst_off_in_page; size_t src_off_in_page; - size_t start_offset = offset_in_page(dst->start); unsigned long dst_i; unsigned long src_i; @@ -6028,11 +6004,11 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, } while (len > 0) { - dst_off_in_page = offset_in_page(start_offset + dst_offset); - src_off_in_page = offset_in_page(start_offset + src_offset); + dst_off_in_page = offset_in_page(dst_offset); + src_off_in_page = offset_in_page(src_offset); - dst_i = (start_offset + dst_offset) >> PAGE_SHIFT; - src_i = (start_offset + src_offset) >> PAGE_SHIFT; + dst_i = dst_offset >> PAGE_SHIFT; + src_i = src_offset >> PAGE_SHIFT; cur = min(len, (unsigned long)(PAGE_SIZE - src_off_in_page)); @@ -6048,8 +6024,9 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, } } -void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, - unsigned long src_offset, unsigned long len) +void memmove_extent_buffer(const struct extent_buffer *dst, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len) { struct btrfs_fs_info *fs_info = dst->fs_info; size_t cur; @@ -6057,7 +6034,6 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, size_t src_off_in_page; unsigned long dst_end = dst_offset + len - 1; unsigned long src_end = src_offset + len - 1; - size_t start_offset = offset_in_page(dst->start); unsigned long dst_i; unsigned long src_i; @@ -6078,11 +6054,11 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, return; } while (len > 0) { - dst_i = (start_offset + dst_end) >> PAGE_SHIFT; - src_i = (start_offset + src_end) >> PAGE_SHIFT; + dst_i = dst_end >> PAGE_SHIFT; + src_i = src_end >> PAGE_SHIFT; - dst_off_in_page = offset_in_page(start_offset + dst_end); - src_off_in_page = offset_in_page(start_offset + src_end); + dst_off_in_page = offset_in_page(dst_end); + src_off_in_page = offset_in_page(src_end); cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 25594e09fdcd..602bf3af9fb4 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -66,6 +66,10 @@ struct btrfs_io_bio; struct io_failure_record; struct extent_io_tree; +typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio, + int mirror_num, + unsigned long bio_flags); + typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, struct bio *bio, u64 bio_offset); @@ -74,8 +78,7 @@ struct extent_io_ops { * The following callbacks must be always defined, the function * pointer will be called unconditionally. */ - blk_status_t (*submit_bio_hook)(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags); + submit_bio_hook_t *submit_bio_hook; int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror); @@ -209,7 +212,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); -struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); +struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src); struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); void free_extent_buffer(struct extent_buffer *eb); @@ -227,7 +230,7 @@ static inline int num_extent_pages(const struct extent_buffer *eb) (eb->start >> PAGE_SHIFT); } -static inline int extent_buffer_uptodate(struct extent_buffer *eb) +static inline int extent_buffer_uptodate(const struct extent_buffer *eb) { return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); } @@ -240,37 +243,37 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dst, int read_extent_buffer_to_user(const struct extent_buffer *eb, void __user *dst, unsigned long start, unsigned long len); -void write_extent_buffer_fsid(struct extent_buffer *eb, const void *src); -void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb, +void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *src); +void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, const void *src); -void write_extent_buffer(struct extent_buffer *eb, const void *src, +void write_extent_buffer(const struct extent_buffer *eb, const void *src, unsigned long start, unsigned long len); -void copy_extent_buffer_full(struct extent_buffer *dst, - struct extent_buffer *src); -void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, +void copy_extent_buffer_full(const struct extent_buffer *dst, + const struct extent_buffer *src); +void copy_extent_buffer(const struct extent_buffer *dst, + const struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, unsigned long len); -void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, - unsigned long src_offset, unsigned long len); -void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, - unsigned long src_offset, unsigned long len); -void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start, +void memcpy_extent_buffer(const struct extent_buffer *dst, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len); +void memmove_extent_buffer(const struct extent_buffer *dst, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len); +void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, unsigned long len); -int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, +int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, unsigned long pos); -void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, +void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len); -void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, - unsigned long pos, unsigned long len); -void clear_extent_buffer_dirty(struct extent_buffer *eb); +void extent_buffer_bitmap_clear(const struct extent_buffer *eb, + unsigned long start, unsigned long pos, + unsigned long len); +void clear_extent_buffer_dirty(const struct extent_buffer *eb); bool set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); -int extent_buffer_under_io(struct extent_buffer *eb); -int map_private_extent_buffer(const struct extent_buffer *eb, - unsigned long offset, unsigned long min_len, - char **map, unsigned long *map_start, - unsigned long *map_len); +int extent_buffer_under_io(const struct extent_buffer *eb); void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, @@ -289,7 +292,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); -int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num); +int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num); /* * When IO fails, either with EIO or csum verification fails, we @@ -311,12 +314,12 @@ struct io_failure_record { }; -bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages, - struct io_failure_record *failrec, int fail_mirror); -struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, - struct io_failure_record *failrec, - struct page *page, int pg_offset, int icsum, - bio_end_io_t *endio_func, void *data); +blk_status_t btrfs_submit_read_repair(struct inode *inode, + struct bio *failed_bio, u64 phy_offset, + struct page *page, unsigned int pgoff, + u64 start, u64 end, int failed_mirror, + submit_bio_hook_t *submit_bio_hook); + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index b618ad5339ba..706a3128e192 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -242,11 +242,13 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, /** * btrfs_lookup_bio_sums - Look up checksums for a bio. * @inode: inode that the bio is for. - * @bio: bio embedded in btrfs_io_bio. + * @bio: bio to look up. * @offset: Unless (u64)-1, look up checksums for this offset in the file. * If (u64)-1, use the page offsets from the bio instead. - * @dst: Buffer of size btrfs_super_csum_size() used to return checksum. If - * NULL, the checksum is returned in btrfs_io_bio(bio)->csum instead. + * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return + * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If + * NULL, the checksum buffer is allocated and returned in + * btrfs_io_bio(bio)->csum instead. * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. */ @@ -256,7 +258,6 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct bio_vec bvec; struct bvec_iter iter; - struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio); struct btrfs_csum_item *item = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_path *path; @@ -277,6 +278,8 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; if (!dst) { + struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio); + if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { btrfs_bio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); @@ -598,13 +601,12 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, index = 0; } - crypto_shash_init(shash); data = kmap_atomic(bvec.bv_page); - crypto_shash_update(shash, data + bvec.bv_offset + crypto_shash_digest(shash, data + bvec.bv_offset + (i * fs_info->sectorsize), - fs_info->sectorsize); + fs_info->sectorsize, + sums->sums + index); kunmap_atomic(data); - crypto_shash_final(shash, (char *)(sums->sums + index)); index += csum_size; offset += fs_info->sectorsize; this_sum_bytes += fs_info->sectorsize; @@ -869,7 +871,7 @@ again: } ret = PTR_ERR(item); if (ret != -EFBIG && ret != -ENOENT) - goto fail_unlock; + goto out; if (ret == -EFBIG) { u32 item_size; @@ -887,10 +889,12 @@ again: nritems = btrfs_header_nritems(path->nodes[0]); if (!nritems || (path->slots[0] >= nritems - 1)) { ret = btrfs_next_leaf(root, path); - if (ret == 1) + if (ret < 0) { + goto out; + } else if (ret > 0) { found_next = 1; - if (ret != 0) goto insert; + } slot = path->slots[0]; } btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); @@ -905,14 +909,27 @@ again: } /* - * at this point, we know the tree has an item, but it isn't big - * enough yet to put our csum in. Grow it + * At this point, we know the tree has a checksum item that ends at an + * offset matching the start of the checksum range we want to insert. + * We try to extend that item as much as possible and then add as many + * checksums to it as they fit. + * + * First check if the leaf has enough free space for at least one + * checksum. If it has go directly to the item extension code, otherwise + * release the path and do a search for insertion before the extension. */ + if (btrfs_leaf_free_space(leaf) >= csum_size) { + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + csum_offset = (bytenr - found_key.offset) >> + fs_info->sb->s_blocksize_bits; + goto extend_csum; + } + btrfs_release_path(path); ret = btrfs_search_slot(trans, root, &file_key, path, csum_size, 1); if (ret < 0) - goto fail_unlock; + goto out; if (ret > 0) { if (path->slots[0] == 0) @@ -931,19 +948,13 @@ again: goto insert; } +extend_csum: if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) / csum_size) { int extend_nr; u64 tmp; u32 diff; - u32 free_space; - - if (btrfs_leaf_free_space(leaf) < - sizeof(struct btrfs_item) + csum_size * 2) - goto insert; - free_space = btrfs_leaf_free_space(leaf) - - sizeof(struct btrfs_item) - csum_size; tmp = sums->len - total_bytes; tmp >>= fs_info->sb->s_blocksize_bits; WARN_ON(tmp < 1); @@ -954,7 +965,7 @@ again: MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size); diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); - diff = min(free_space, diff); + diff = min_t(u32, btrfs_leaf_free_space(leaf), diff); diff /= csum_size; diff *= csum_size; @@ -985,9 +996,9 @@ insert: ins_size); path->leave_spinning = 0; if (ret < 0) - goto fail_unlock; + goto out; if (WARN_ON(ret != 0)) - goto fail_unlock; + goto out; leaf = path->nodes[0]; csum: item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); @@ -1017,9 +1028,6 @@ found: out: btrfs_free_path(path); return ret; - -fail_unlock: - goto out; } void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 719e68ab552c..fde125616687 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -275,26 +275,18 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, { struct btrfs_root *inode_root; struct inode *inode; - struct btrfs_key key; struct btrfs_ioctl_defrag_range_args range; int num_defrag; int ret; /* get the inode */ - key.objectid = defrag->root; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - inode_root = btrfs_get_fs_root(fs_info, &key, true); + inode_root = btrfs_get_fs_root(fs_info, defrag->root, true); if (IS_ERR(inode_root)) { ret = PTR_ERR(inode_root); goto cleanup; } - key.objectid = defrag->ino; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - inode = btrfs_iget(fs_info->sb, &key, inode_root); + inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root); btrfs_put_root(inode_root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); @@ -775,7 +767,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent) modify_tree = 0; - update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || root == fs_info->tree_root); while (1) { recow = 0; @@ -1817,21 +1809,61 @@ again: return num_written ? num_written : ret; } -static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) +static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, + const struct iov_iter *iter, loff_t offset) +{ + const unsigned int blocksize_mask = fs_info->sectorsize - 1; + + if (offset & blocksize_mask) + return -EINVAL; + + if (iov_iter_alignment(iter) & blocksize_mask) + return -EINVAL; + + return 0; +} + +static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - loff_t pos; - ssize_t written; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + loff_t pos = iocb->ki_pos; + ssize_t written = 0; ssize_t written_buffered; loff_t endbyte; int err; + size_t count = 0; + bool relock = false; - written = generic_file_direct_write(iocb, from); + if (check_direct_IO(fs_info, from, pos)) + goto buffered; + + count = iov_iter_count(from); + /* + * If the write DIO is beyond the EOF, we need update the isize, but it + * is protected by i_mutex. So we can not unlock the i_mutex at this + * case. + */ + if (pos + count <= inode->i_size) { + inode_unlock(inode); + relock = true; + } else if (iocb->ki_flags & IOCB_NOWAIT) { + return -EAGAIN; + } + + down_read(&BTRFS_I(inode)->dio_sem); + written = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dops, + is_sync_kiocb(iocb)); + up_read(&BTRFS_I(inode)->dio_sem); + + if (relock) + inode_lock(inode); if (written < 0 || !iov_iter_count(from)) return written; +buffered: pos = iocb->ki_pos; written_buffered = btrfs_buffered_write(iocb, from); if (written_buffered < 0) { @@ -1970,7 +2002,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, atomic_inc(&BTRFS_I(inode)->sync_writers); if (iocb->ki_flags & IOCB_DIRECT) { - num_written = __btrfs_direct_write(iocb, from); + num_written = btrfs_direct_write(iocb, from); } else { num_written = btrfs_buffered_write(iocb, from); if (num_written > 0) @@ -3484,9 +3516,54 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) return generic_file_open(inode, filp); } +static int check_direct_read(struct btrfs_fs_info *fs_info, + const struct iov_iter *iter, loff_t offset) +{ + int ret; + int i, seg; + + ret = check_direct_IO(fs_info, iter, offset); + if (ret < 0) + return ret; + + for (seg = 0; seg < iter->nr_segs; seg++) + for (i = seg + 1; i < iter->nr_segs; i++) + if (iter->iov[seg].iov_base == iter->iov[i].iov_base) + return -EINVAL; + return 0; +} + +static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos)) + return 0; + + inode_lock_shared(inode); + ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dops, + is_sync_kiocb(iocb)); + inode_unlock_shared(inode); + return ret; +} + +static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + ssize_t ret = 0; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = btrfs_direct_read(iocb, to); + if (ret < 0) + return ret; + } + + return generic_file_buffered_read(iocb, to, ret); +} + const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, - .read_iter = generic_file_read_iter, + .read_iter = btrfs_file_read_iter, .splice_read = generic_file_splice_read, .write_iter = btrfs_file_write_iter, .mmap = btrfs_file_mmap, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 3613da065a73..55955bd424d7 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -82,7 +82,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, * sure NOFS is set to keep us from deadlocking. */ nofs_flag = memalloc_nofs_save(); - inode = btrfs_iget_path(fs_info->sb, &location, root, path); + inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path); btrfs_release_path(path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(inode)) @@ -1190,13 +1190,10 @@ out: if (ret) { invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; - if (block_group) { -#ifdef CONFIG_BTRFS_DEBUG - btrfs_err(root->fs_info, - "failed to write free space cache for block group %llu", - block_group->start); -#endif - } + if (block_group) + btrfs_debug(root->fs_info, + "failed to write free space cache for block group %llu error %d", + block_group->start, ret); } btrfs_update_inode(trans, root, inode); @@ -1415,11 +1412,9 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans, ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl, block_group, &block_group->io_ctl, trans); if (ret) { -#ifdef CONFIG_BTRFS_DEBUG - btrfs_err(fs_info, - "failed to write free space cache for block group %llu", - block_group->start); -#endif + btrfs_debug(fs_info, + "failed to write free space cache for block group %llu error %d", + block_group->start, ret); spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_ERROR; spin_unlock(&block_group->lock); @@ -3762,46 +3757,6 @@ out: return ret; } -void btrfs_get_block_group_trimming(struct btrfs_block_group *cache) -{ - atomic_inc(&cache->trimming); -} - -void btrfs_put_block_group_trimming(struct btrfs_block_group *block_group) -{ - struct btrfs_fs_info *fs_info = block_group->fs_info; - struct extent_map_tree *em_tree; - struct extent_map *em; - bool cleanup; - - spin_lock(&block_group->lock); - cleanup = (atomic_dec_and_test(&block_group->trimming) && - block_group->removed); - spin_unlock(&block_group->lock); - - if (cleanup) { - mutex_lock(&fs_info->chunk_mutex); - em_tree = &fs_info->mapping_tree; - write_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, block_group->start, - 1); - BUG_ON(!em); /* logic error, can't happen */ - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - mutex_unlock(&fs_info->chunk_mutex); - - /* once for us and once for the tree */ - free_extent_map(em); - free_extent_map(em); - - /* - * We've left one free space entry and other tasks trimming - * this block group have left 1 entry each one. Free them. - */ - __btrfs_remove_free_space_cache(block_group->free_space_ctl); - } -} - int btrfs_trim_block_group(struct btrfs_block_group *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen) { @@ -3816,7 +3771,7 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group, spin_unlock(&block_group->lock); return 0; } - btrfs_get_block_group_trimming(block_group); + btrfs_freeze_block_group(block_group); spin_unlock(&block_group->lock); ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false); @@ -3829,7 +3784,7 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group, if (rem) reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end)); out: - btrfs_put_block_group_trimming(block_group); + btrfs_unfreeze_block_group(block_group); return ret; } @@ -3846,11 +3801,11 @@ int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, spin_unlock(&block_group->lock); return 0; } - btrfs_get_block_group_trimming(block_group); + btrfs_freeze_block_group(block_group); spin_unlock(&block_group->lock); ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async); - btrfs_put_block_group_trimming(block_group); + btrfs_unfreeze_block_group(block_group); return ret; } @@ -3868,13 +3823,13 @@ int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, spin_unlock(&block_group->lock); return 0; } - btrfs_get_block_group_trimming(block_group); + btrfs_freeze_block_group(block_group); spin_unlock(&block_group->lock); ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen, async); - btrfs_put_block_group_trimming(block_group); + btrfs_unfreeze_block_group(block_group); return ret; } @@ -4035,11 +3990,9 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, if (release_metadata) btrfs_delalloc_release_metadata(BTRFS_I(inode), inode->i_size, true); -#ifdef CONFIG_BTRFS_DEBUG - btrfs_err(fs_info, - "failed to write free ino cache for root %llu", - root->root_key.objectid); -#endif + btrfs_debug(fs_info, + "failed to write free ino cache for root %llu error %d", + root->root_key.objectid, ret); } return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8b3489f229c7..768c8be4c765 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5,7 +5,6 @@ #include <linux/kernel.h> #include <linux/bio.h> -#include <linux/buffer_head.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/pagemap.h> @@ -49,17 +48,18 @@ #include "qgroup.h" #include "delalloc-space.h" #include "block-group.h" +#include "space-info.h" struct btrfs_iget_args { - struct btrfs_key *location; + u64 ino; struct btrfs_root *root; }; struct btrfs_dio_data { u64 reserve; - u64 unsubmitted_oe_range_start; - u64 unsubmitted_oe_range_end; - int overwrite; + loff_t length; + ssize_t submitted; + struct extent_changeset *data_reserved; }; static const struct inode_operations btrfs_dir_inode_operations; @@ -1142,7 +1142,7 @@ out_unlock: */ if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, - start + cur_alloc_size, + start + cur_alloc_size - 1, locked_page, clear_bits, page_ops); @@ -1355,6 +1355,66 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, return 1; } +static int fallback_to_cow(struct inode *inode, struct page *locked_page, + const u64 start, const u64 end, + int *page_started, unsigned long *nr_written) +{ + const bool is_space_ino = btrfs_is_free_space_inode(BTRFS_I(inode)); + const u64 range_bytes = end + 1 - start; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + u64 range_start = start; + u64 count; + + /* + * If EXTENT_NORESERVE is set it means that when the buffered write was + * made we had not enough available data space and therefore we did not + * reserve data space for it, since we though we could do NOCOW for the + * respective file range (either there is prealloc extent or the inode + * has the NOCOW bit set). + * + * However when we need to fallback to COW mode (because for example the + * block group for the corresponding extent was turned to RO mode by a + * scrub or relocation) we need to do the following: + * + * 1) We increment the bytes_may_use counter of the data space info. + * If COW succeeds, it allocates a new data extent and after doing + * that it decrements the space info's bytes_may_use counter and + * increments its bytes_reserved counter by the same amount (we do + * this at btrfs_add_reserved_bytes()). So we need to increment the + * bytes_may_use counter to compensate (when space is reserved at + * buffered write time, the bytes_may_use counter is incremented); + * + * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so + * that if the COW path fails for any reason, it decrements (through + * extent_clear_unlock_delalloc()) the bytes_may_use counter of the + * data space info, which we incremented in the step above. + * + * If we need to fallback to cow and the inode corresponds to a free + * space cache inode, we must also increment bytes_may_use of the data + * space_info for the same reason. Space caches always get a prealloc + * extent for them, however scrub or balance may have set the block + * group that contains that extent to RO mode. + */ + count = count_range_bits(io_tree, &range_start, end, range_bytes, + EXTENT_NORESERVE, 0); + if (count > 0 || is_space_ino) { + const u64 bytes = is_space_ino ? range_bytes : count; + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_space_info *sinfo = fs_info->data_sinfo; + + spin_lock(&sinfo->lock); + btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes); + spin_unlock(&sinfo->lock); + + if (count > 0) + clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, + 0, 0, NULL); + } + + return cow_file_range(inode, locked_page, start, end, page_started, + nr_written, 1); +} + /* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. @@ -1602,9 +1662,9 @@ out_check: * NOCOW, following one which needs to be COW'ed */ if (cow_start != (u64)-1) { - ret = cow_file_range(inode, locked_page, - cow_start, found_key.offset - 1, - page_started, nr_written, 1); + ret = fallback_to_cow(inode, locked_page, cow_start, + found_key.offset - 1, + page_started, nr_written); if (ret) { if (nocow) btrfs_dec_nocow_writers(fs_info, @@ -1693,8 +1753,8 @@ out_check: if (cow_start != (u64)-1) { cur_offset = end; - ret = cow_file_range(inode, locked_page, cow_start, end, - page_started, nr_written, 1); + ret = fallback_to_cow(inode, locked_page, cow_start, end, + page_started, nr_written); if (ret) goto error; } @@ -2726,10 +2786,9 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, btrfs_queue_work(wq, &ordered_extent->work); } -static int __readpage_endio_check(struct inode *inode, - struct btrfs_io_bio *io_bio, - int icsum, struct page *page, - int pgoff, u64 start, size_t len) +static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, + int icsum, struct page *page, int pgoff, u64 start, + size_t len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); @@ -2743,9 +2802,7 @@ static int __readpage_endio_check(struct inode *inode, kaddr = kmap_atomic(page); shash->tfm = fs_info->csum_shash; - crypto_shash_init(shash); - crypto_shash_update(shash, kaddr + pgoff, len); - crypto_shash_final(shash, csum); + crypto_shash_digest(shash, kaddr + pgoff, len, csum); if (memcmp(csum, csum_expected, csum_size)) goto zeroit; @@ -2790,8 +2847,8 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, } phy_offset >>= inode->i_sb->s_blocksize_bits; - return __readpage_endio_check(inode, io_bio, phy_offset, page, offset, - start, (size_t)(end - start + 1)); + return check_data_csum(inode, io_bio, phy_offset, page, offset, start, + (size_t)(end - start + 1)); } /* @@ -2981,7 +3038,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.objectid = found_key.offset; found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; - inode = btrfs_iget(fs_info->sb, &found_key, root); + inode = btrfs_iget(fs_info->sb, last_objectid, root); ret = PTR_ERR_OR_ZERO(inode); if (ret && ret != -ENOENT) goto out; @@ -3000,18 +3057,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * orphan must not get deleted. * find_dead_roots already ran before us, so if this * is a snapshot deletion, we should find the root - * in the dead_roots list + * in the fs_roots radix tree. */ - spin_lock(&fs_info->trans_lock); - list_for_each_entry(dead_root, &fs_info->dead_roots, - root_list) { - if (dead_root->root_key.objectid == - found_key.objectid) { - is_dead_root = 1; - break; - } - } - spin_unlock(&fs_info->trans_lock); + + spin_lock(&fs_info->fs_roots_radix_lock); + dead_root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)found_key.objectid); + if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0) + is_dead_root = 1; + spin_unlock(&fs_info->fs_roots_radix_lock); + if (is_dead_root) { /* prevent this orphan from being found again */ key.offset = found_key.objectid - 1; @@ -3357,43 +3412,40 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_init_map_token(&token, leaf); - btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); - btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); - btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size, - &token); - btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); - btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); - - btrfs_set_token_timespec_sec(leaf, &item->atime, - inode->i_atime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, &item->atime, - inode->i_atime.tv_nsec, &token); - - btrfs_set_token_timespec_sec(leaf, &item->mtime, - inode->i_mtime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, &item->mtime, - inode->i_mtime.tv_nsec, &token); - - btrfs_set_token_timespec_sec(leaf, &item->ctime, - inode->i_ctime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, &item->ctime, - inode->i_ctime.tv_nsec, &token); - - btrfs_set_token_timespec_sec(leaf, &item->otime, - BTRFS_I(inode)->i_otime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, &item->otime, - BTRFS_I(inode)->i_otime.tv_nsec, &token); - - btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), - &token); - btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, - &token); - btrfs_set_token_inode_sequence(leaf, item, inode_peek_iversion(inode), - &token); - btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); - btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); - btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); - btrfs_set_token_inode_block_group(leaf, item, 0, &token); + btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); + btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); + btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size); + btrfs_set_token_inode_mode(&token, item, inode->i_mode); + btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); + + btrfs_set_token_timespec_sec(&token, &item->atime, + inode->i_atime.tv_sec); + btrfs_set_token_timespec_nsec(&token, &item->atime, + inode->i_atime.tv_nsec); + + btrfs_set_token_timespec_sec(&token, &item->mtime, + inode->i_mtime.tv_sec); + btrfs_set_token_timespec_nsec(&token, &item->mtime, + inode->i_mtime.tv_nsec); + + btrfs_set_token_timespec_sec(&token, &item->ctime, + inode->i_ctime.tv_sec); + btrfs_set_token_timespec_nsec(&token, &item->ctime, + inode->i_ctime.tv_nsec); + + btrfs_set_token_timespec_sec(&token, &item->otime, + BTRFS_I(inode)->i_otime.tv_sec); + btrfs_set_token_timespec_nsec(&token, &item->otime, + BTRFS_I(inode)->i_otime.tv_nsec); + + btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); + btrfs_set_token_inode_generation(&token, item, + BTRFS_I(inode)->generation); + btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); + btrfs_set_token_inode_transid(&token, item, trans->transid); + btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); + btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags); + btrfs_set_token_inode_block_group(&token, item, 0); } /* @@ -3618,7 +3670,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) * 1 for the inode ref * 1 for the inode */ - return btrfs_start_transaction_fallback_global_rsv(root, 5, 5); + return btrfs_start_transaction_fallback_global_rsv(root, 5); } static int btrfs_unlink(struct inode *dir, struct dentry *dentry) @@ -4108,11 +4160,12 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); /* - * for non-free space inodes and ref cows, we want to back off from - * time to time + * For non-free space inodes and non-shareable roots, we want to back + * off from time to time. This means all inodes in subvolume roots, + * reloc roots, and data reloc roots. */ if (!btrfs_is_free_space_inode(BTRFS_I(inode)) && - test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) be_nice = true; path = btrfs_alloc_path(); @@ -4120,20 +4173,19 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, return -ENOMEM; path->reada = READA_BACK; - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, &cached_state); - /* - * We want to drop from the next block forward in case this new size is - * not block aligned since we will be keeping the last block of the - * extent just the way it is. - */ - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || - root == fs_info->tree_root) + /* + * We want to drop from the next block forward in case this + * new size is not block aligned since we will be keeping the + * last block of the extent just the way it is. + */ btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size, fs_info->sectorsize), (u64)-1, 0); + } /* * This function is also used to drop the items in the log tree before @@ -4241,7 +4293,7 @@ search_again: extent_num_bytes); num_dec = (orig_num_bytes - extent_num_bytes); - if (test_bit(BTRFS_ROOT_REF_COWS, + if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && extent_start != 0) inode_sub_bytes(inode, num_dec); @@ -4257,7 +4309,7 @@ search_again: num_dec = btrfs_file_extent_num_bytes(leaf, fi); if (extent_start != 0) { found_extent = 1; - if (test_bit(BTRFS_ROOT_REF_COWS, + if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) inode_sub_bytes(inode, num_dec); } @@ -4293,7 +4345,7 @@ search_again: clear_len = fs_info->sectorsize; } - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) inode_sub_bytes(inode, item_end + 1 - new_size); } delete: @@ -4334,8 +4386,7 @@ delete: should_throttle = false; if (found_extent && - (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || - root == fs_info->tree_root)) { + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_ref ref = { 0 }; bytes_deleted += extent_num_bytes; @@ -4759,10 +4810,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) truncate_setsize(inode, newsize); - /* Disable nonlocked read DIO to avoid the endless truncate */ - btrfs_inode_block_unlocked_dio(BTRFS_I(inode)); inode_dio_wait(inode); - btrfs_inode_resume_unlocked_dio(BTRFS_I(inode)); ret = btrfs_truncate(inode, newsize == oldsize); if (ret && inode->i_nlink) { @@ -5154,7 +5202,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, btrfs_release_path(path); - new_root = btrfs_get_fs_root(fs_info, location, true); + new_root = btrfs_get_fs_root(fs_info, location->objectid, true); if (IS_ERR(new_root)) { err = PTR_ERR(new_root); goto out; @@ -5232,9 +5280,11 @@ static void inode_tree_del(struct inode *inode) static int btrfs_init_locked_inode(struct inode *inode, void *p) { struct btrfs_iget_args *args = p; - inode->i_ino = args->location->objectid; - memcpy(&BTRFS_I(inode)->location, args->location, - sizeof(*args->location)); + + inode->i_ino = args->ino; + BTRFS_I(inode)->location.objectid = args->ino; + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; BTRFS_I(inode)->root = btrfs_grab_root(args->root); BUG_ON(args->root && !BTRFS_I(inode)->root); return 0; @@ -5243,19 +5293,19 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) static int btrfs_find_actor(struct inode *inode, void *opaque) { struct btrfs_iget_args *args = opaque; - return args->location->objectid == BTRFS_I(inode)->location.objectid && + + return args->ino == BTRFS_I(inode)->location.objectid && args->root == BTRFS_I(inode)->root; } -static struct inode *btrfs_iget_locked(struct super_block *s, - struct btrfs_key *location, +static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; - unsigned long hashval = btrfs_inode_hash(location->objectid, root); + unsigned long hashval = btrfs_inode_hash(ino, root); - args.location = location; + args.ino = ino; args.root = root; inode = iget5_locked(s, hashval, btrfs_find_actor, @@ -5265,17 +5315,17 @@ static struct inode *btrfs_iget_locked(struct super_block *s, } /* - * Get an inode object given its location and corresponding root. + * Get an inode object given its inode number and corresponding root. * Path can be preallocated to prevent recursing back to iget through * allocator. NULL is also valid but may require an additional allocation * later. */ -struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location, +struct inode *btrfs_iget_path(struct super_block *s, u64 ino, struct btrfs_root *root, struct btrfs_path *path) { struct inode *inode; - inode = btrfs_iget_locked(s, location, root); + inode = btrfs_iget_locked(s, ino, root); if (!inode) return ERR_PTR(-ENOMEM); @@ -5302,10 +5352,9 @@ struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location, return inode; } -struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root) +struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root) { - return btrfs_iget_path(s, location, root, NULL); + return btrfs_iget_path(s, ino, root, NULL); } static struct inode *new_simple_dir(struct super_block *s, @@ -5374,7 +5423,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return ERR_PTR(ret); if (location.type == BTRFS_INODE_ITEM_KEY) { - inode = btrfs_iget(dir->i_sb, &location, root); + inode = btrfs_iget(dir->i_sb, location.objectid, root); if (IS_ERR(inode)) return inode; @@ -5398,7 +5447,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) else inode = new_simple_dir(dir->i_sb, &location, sub_root); } else { - inode = btrfs_iget(dir->i_sb, &location, sub_root); + inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); } if (root != sub_root) btrfs_put_root(sub_root); @@ -5779,7 +5828,8 @@ int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index) static int btrfs_insert_inode_locked(struct inode *inode) { struct btrfs_iget_args args; - args.location = &BTRFS_I(inode)->location; + + args.ino = BTRFS_I(inode)->location.objectid; args.root = BTRFS_I(inode)->root; return insert_inode_locked4(inode, @@ -6991,7 +7041,7 @@ out: } static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, - struct extent_state **cached_state, int writing) + struct extent_state **cached_state, bool writing) { struct btrfs_ordered_extent *ordered; int ret = 0; @@ -7129,30 +7179,7 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, } -static int btrfs_get_blocks_direct_read(struct extent_map *em, - struct buffer_head *bh_result, - struct inode *inode, - u64 start, u64 len) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - - if (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - return -ENOENT; - - len = min(len, em->len - (start - em->start)); - - bh_result->b_blocknr = (em->block_start + (start - em->start)) >> - inode->i_blkbits; - bh_result->b_size = len; - bh_result->b_bdev = fs_info->fs_devices->latest_bdev; - set_buffer_mapped(bh_result); - - return 0; -} - static int btrfs_get_blocks_direct_write(struct extent_map **map, - struct buffer_head *bh_result, struct inode *inode, struct btrfs_dio_data *dio_data, u64 start, u64 len) @@ -7214,7 +7241,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, } /* this will cow the extent */ - len = bh_result->b_size; free_extent_map(em); *map = em = btrfs_new_extent_direct(inode, start, len); if (IS_ERR(em)) { @@ -7225,64 +7251,73 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, len = min(len, em->len - (start - em->start)); skip_cow: - bh_result->b_blocknr = (em->block_start + (start - em->start)) >> - inode->i_blkbits; - bh_result->b_size = len; - bh_result->b_bdev = fs_info->fs_devices->latest_bdev; - set_buffer_mapped(bh_result); - - if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - set_buffer_new(bh_result); - /* * Need to update the i_size under the extent lock so buffered * readers will get the updated i_size when we unlock. */ - if (!dio_data->overwrite && start + len > i_size_read(inode)) + if (start + len > i_size_read(inode)) i_size_write(inode, start + len); - WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; - dio_data->unsubmitted_oe_range_end = start + len; - current->journal_info = dio_data; out: return ret; } -static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, + loff_t length, unsigned flags, struct iomap *iomap, + struct iomap *srcmap) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; struct extent_state *cached_state = NULL; struct btrfs_dio_data *dio_data = NULL; - u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; - u64 len = bh_result->b_size; + const bool write = !!(flags & IOMAP_WRITE); int ret = 0; + u64 len = length; + bool unlock_extents = false; - if (!create) + if (!write) len = min_t(u64, len, fs_info->sectorsize); lockstart = start; lockend = start + len - 1; - if (current->journal_info) { - /* - * Need to pull our outstanding extents and set journal_info to NULL so - * that anything that needs to check if there's a transaction doesn't get - * confused. - */ - dio_data = current->journal_info; - current->journal_info = NULL; + /* + * The generic stuff only does filemap_write_and_wait_range, which + * isn't enough if we've written compressed pages to this area, so we + * need to flush the dirty pages again to make absolutely sure that any + * outstanding dirty pages are on disk. + */ + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + ret = filemap_fdatawrite_range(inode->i_mapping, start, + start + length - 1); + + dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS); + if (!dio_data) + return -ENOMEM; + + dio_data->length = length; + if (write) { + dio_data->reserve = round_up(length, fs_info->sectorsize); + ret = btrfs_delalloc_reserve_space(inode, + &dio_data->data_reserved, + start, dio_data->reserve); + if (ret) { + extent_changeset_free(dio_data->data_reserved); + kfree(dio_data); + return ret; + } } + iomap->private = dio_data; + /* * If this errors out it's because we couldn't invalidate pagecache for * this range and we need to fallback to buffered. */ - if (lock_extent_direct(inode, lockstart, lockend, &cached_state, - create)) { + if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) { ret = -ENOTBLK; goto err; } @@ -7314,36 +7349,48 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, goto unlock_err; } - if (create) { - ret = btrfs_get_blocks_direct_write(&em, bh_result, inode, - dio_data, start, len); + len = min(len, em->len - (start - em->start)); + if (write) { + ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, + start, len); if (ret < 0) goto unlock_err; - - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); + unlock_extents = true; + /* Recalc len in case the new em is smaller than requested */ + len = min(len, em->len - (start - em->start)); } else { - ret = btrfs_get_blocks_direct_read(em, bh_result, inode, - start, len); - /* Can be negative only if we read from a hole */ - if (ret < 0) { - ret = 0; - free_extent_map(em); - goto unlock_err; - } /* * We need to unlock only the end area that we aren't using. * The rest is going to be unlocked by the endio routine. */ - lockstart = start + bh_result->b_size; - if (lockstart < lockend) { - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - lockstart, lockend, &cached_state); - } else { - free_extent_state(cached_state); - } + lockstart = start + len; + if (lockstart < lockend) + unlock_extents = true; } + if (unlock_extents) + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + lockstart, lockend, &cached_state); + else + free_extent_state(cached_state); + + /* + * Translate extent map information to iomap. + * We trim the extents (and move the addr) even though iomap code does + * that, since we have locked only the parts we are performing I/O in. + */ + if ((em->block_start == EXTENT_MAP_HOLE) || + (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { + iomap->addr = IOMAP_NULL_ADDR; + iomap->type = IOMAP_HOLE; + } else { + iomap->addr = em->block_start + (start - em->start); + iomap->type = IOMAP_MAPPED; + } + iomap->offset = start; + iomap->bdev = fs_info->fs_devices->latest_bdev; + iomap->length = len; + free_extent_map(em); return 0; @@ -7352,370 +7399,152 @@ unlock_err: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); err: - if (dio_data) - current->journal_info = dio_data; + if (dio_data) { + btrfs_delalloc_release_space(inode, dio_data->data_reserved, + start, dio_data->reserve, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve); + extent_changeset_free(dio_data->data_reserved); + kfree(dio_data); + } return ret; } -static inline blk_status_t submit_dio_repair_bio(struct inode *inode, - struct bio *bio, - int mirror_num) +static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned flags, struct iomap *iomap) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - blk_status_t ret; + int ret = 0; + struct btrfs_dio_data *dio_data = iomap->private; + size_t submitted = dio_data->submitted; + const bool write = !!(flags & IOMAP_WRITE); - BUG_ON(bio_op(bio) == REQ_OP_WRITE); + if (!write && (iomap->type == IOMAP_HOLE)) { + /* If reading from a hole, unlock and return */ + unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); + goto out; + } - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR); - if (ret) - return ret; + if (submitted < length) { + pos += submitted; + length -= submitted; + if (write) + __endio_write_update_ordered(inode, pos, length, false); + else + unlock_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1); + ret = -ENOTBLK; + } - ret = btrfs_map_bio(fs_info, bio, mirror_num); + if (write) { + if (dio_data->reserve) + btrfs_delalloc_release_space(inode, + dio_data->data_reserved, pos, + dio_data->reserve, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length); + extent_changeset_free(dio_data->data_reserved); + } +out: + kfree(dio_data); + iomap->private = NULL; return ret; } -static int btrfs_check_dio_repairable(struct inode *inode, - struct bio *failed_bio, - struct io_failure_record *failrec, - int failed_mirror) +static void btrfs_dio_private_put(struct btrfs_dio_private *dip) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int num_copies; - - num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); - if (num_copies == 1) { - /* - * we only have a single copy of the data, so don't bother with - * all the retry and error correction code that follows. no - * matter what the error is, it is very likely to persist. - */ - btrfs_debug(fs_info, - "Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d", - num_copies, failrec->this_mirror, failed_mirror); - return 0; - } - - failrec->failed_mirror = failed_mirror; - failrec->this_mirror++; - if (failrec->this_mirror == failed_mirror) - failrec->this_mirror++; + /* + * This implies a barrier so that stores to dio_bio->bi_status before + * this and loads of dio_bio->bi_status after this are fully ordered. + */ + if (!refcount_dec_and_test(&dip->refs)) + return; - if (failrec->this_mirror > num_copies) { - btrfs_debug(fs_info, - "Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", - num_copies, failrec->this_mirror, failed_mirror); - return 0; + if (bio_op(dip->dio_bio) == REQ_OP_WRITE) { + __endio_write_update_ordered(dip->inode, dip->logical_offset, + dip->bytes, + !dip->dio_bio->bi_status); + } else { + unlock_extent(&BTRFS_I(dip->inode)->io_tree, + dip->logical_offset, + dip->logical_offset + dip->bytes - 1); } - return 1; + bio_endio(dip->dio_bio); + kfree(dip); } -static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio, - struct page *page, unsigned int pgoff, - u64 start, u64 end, int failed_mirror, - bio_end_io_t *repair_endio, void *repair_arg) +static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio, + int mirror_num, + unsigned long bio_flags) { - struct io_failure_record *failrec; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct bio *bio; - int isector; - unsigned int read_mode = 0; - int segs; - int ret; - blk_status_t status; - struct bio_vec bvec; + struct btrfs_dio_private *dip = bio->bi_private; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + blk_status_t ret; - BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); + BUG_ON(bio_op(bio) == REQ_OP_WRITE); - ret = btrfs_get_io_failure_record(inode, start, end, &failrec); + ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); if (ret) - return errno_to_blk_status(ret); - - ret = btrfs_check_dio_repairable(inode, failed_bio, failrec, - failed_mirror); - if (!ret) { - free_io_failure(failure_tree, io_tree, failrec); - return BLK_STS_IOERR; - } - - segs = bio_segments(failed_bio); - bio_get_first_bvec(failed_bio, &bvec); - if (segs > 1 || - (bvec.bv_len > btrfs_inode_sectorsize(inode))) - read_mode |= REQ_FAILFAST_DEV; - - isector = start - btrfs_io_bio(failed_bio)->logical; - isector >>= inode->i_sb->s_blocksize_bits; - bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, - pgoff, isector, repair_endio, repair_arg); - bio->bi_opf = REQ_OP_READ | read_mode; - - btrfs_debug(BTRFS_I(inode)->root->fs_info, - "repair DIO read error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d", - read_mode, failrec->this_mirror, failrec->in_validation); - - status = submit_dio_repair_bio(inode, bio, failrec->this_mirror); - if (status) { - free_io_failure(failure_tree, io_tree, failrec); - bio_put(bio); - } - - return status; -} - -struct btrfs_retry_complete { - struct completion done; - struct inode *inode; - u64 start; - int uptodate; -}; + return ret; -static void btrfs_retry_endio_nocsum(struct bio *bio) -{ - struct btrfs_retry_complete *done = bio->bi_private; - struct inode *inode = done->inode; - struct bio_vec *bvec; - struct extent_io_tree *io_tree, *failure_tree; - struct bvec_iter_all iter_all; - - if (bio->bi_status) - goto end; - - ASSERT(bio->bi_vcnt == 1); - io_tree = &BTRFS_I(inode)->io_tree; - failure_tree = &BTRFS_I(inode)->io_failure_tree; - ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode)); - - done->uptodate = 1; - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) - clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree, - io_tree, done->start, bvec->bv_page, - btrfs_ino(BTRFS_I(inode)), 0); -end: - complete(&done->done); - bio_put(bio); + refcount_inc(&dip->refs); + ret = btrfs_map_bio(fs_info, bio, mirror_num); + if (ret) + refcount_dec(&dip->refs); + return ret; } -static blk_status_t __btrfs_correct_data_nocsum(struct inode *inode, - struct btrfs_io_bio *io_bio) +static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, + struct btrfs_io_bio *io_bio, + const bool uptodate) { - struct btrfs_fs_info *fs_info; + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + const u32 sectorsize = fs_info->sectorsize; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); struct bio_vec bvec; struct bvec_iter iter; - struct btrfs_retry_complete done; - u64 start; - unsigned int pgoff; - u32 sectorsize; - int nr_sectors; - blk_status_t ret; + u64 start = io_bio->logical; + int icsum = 0; blk_status_t err = BLK_STS_OK; - fs_info = BTRFS_I(inode)->root->fs_info; - sectorsize = fs_info->sectorsize; - - start = io_bio->logical; - done.inode = inode; - io_bio->bio.bi_iter = io_bio->iter; + __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) { + unsigned int i, nr_sectors, pgoff; - bio_for_each_segment(bvec, &io_bio->bio, iter) { nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); pgoff = bvec.bv_offset; - -next_block_or_try_again: - done.uptodate = 0; - done.start = start; - init_completion(&done.done); - - ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page, - pgoff, start, start + sectorsize - 1, - io_bio->mirror_num, - btrfs_retry_endio_nocsum, &done); - if (ret) { - err = ret; - goto next; - } - - wait_for_completion_io(&done.done); - - if (!done.uptodate) { - /* We might have another mirror, so try again */ - goto next_block_or_try_again; - } - -next: - start += sectorsize; - - nr_sectors--; - if (nr_sectors) { - pgoff += sectorsize; + for (i = 0; i < nr_sectors; i++) { ASSERT(pgoff < PAGE_SIZE); - goto next_block_or_try_again; - } - } - - return err; -} - -static void btrfs_retry_endio(struct bio *bio) -{ - struct btrfs_retry_complete *done = bio->bi_private; - struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - struct extent_io_tree *io_tree, *failure_tree; - struct inode *inode = done->inode; - struct bio_vec *bvec; - int uptodate; - int ret; - int i = 0; - struct bvec_iter_all iter_all; - - if (bio->bi_status) - goto end; - - uptodate = 1; - - ASSERT(bio->bi_vcnt == 1); - ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode)); - - io_tree = &BTRFS_I(inode)->io_tree; - failure_tree = &BTRFS_I(inode)->io_failure_tree; - - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { - ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, - bvec->bv_offset, done->start, - bvec->bv_len); - if (!ret) - clean_io_failure(BTRFS_I(inode)->root->fs_info, - failure_tree, io_tree, done->start, - bvec->bv_page, - btrfs_ino(BTRFS_I(inode)), - bvec->bv_offset); - else - uptodate = 0; - i++; - } - - done->uptodate = uptodate; -end: - complete(&done->done); - bio_put(bio); -} - -static blk_status_t __btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio, blk_status_t err) -{ - struct btrfs_fs_info *fs_info; - struct bio_vec bvec; - struct bvec_iter iter; - struct btrfs_retry_complete done; - u64 start; - u64 offset = 0; - u32 sectorsize; - int nr_sectors; - unsigned int pgoff; - int csum_pos; - bool uptodate = (err == 0); - int ret; - blk_status_t status; - - fs_info = BTRFS_I(inode)->root->fs_info; - sectorsize = fs_info->sectorsize; - - err = BLK_STS_OK; - start = io_bio->logical; - done.inode = inode; - io_bio->bio.bi_iter = io_bio->iter; - - bio_for_each_segment(bvec, &io_bio->bio, iter) { - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); - - pgoff = bvec.bv_offset; -next_block: - if (uptodate) { - csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset); - ret = __readpage_endio_check(inode, io_bio, csum_pos, - bvec.bv_page, pgoff, start, sectorsize); - if (likely(!ret)) - goto next; - } -try_again: - done.uptodate = 0; - done.start = start; - init_completion(&done.done); - - status = dio_read_error(inode, &io_bio->bio, bvec.bv_page, - pgoff, start, start + sectorsize - 1, - io_bio->mirror_num, btrfs_retry_endio, - &done); - if (status) { - err = status; - goto next; - } - - wait_for_completion_io(&done.done); - - if (!done.uptodate) { - /* We might have another mirror, so try again */ - goto try_again; - } -next: - offset += sectorsize; - start += sectorsize; - - ASSERT(nr_sectors); - - nr_sectors--; - if (nr_sectors) { + if (uptodate && + (!csum || !check_data_csum(inode, io_bio, icsum, + bvec.bv_page, pgoff, + start, sectorsize))) { + clean_io_failure(fs_info, failure_tree, io_tree, + start, bvec.bv_page, + btrfs_ino(BTRFS_I(inode)), + pgoff); + } else { + blk_status_t status; + + status = btrfs_submit_read_repair(inode, + &io_bio->bio, + start - io_bio->logical, + bvec.bv_page, pgoff, + start, + start + sectorsize - 1, + io_bio->mirror_num, + submit_dio_repair_bio); + if (status) + err = status; + } + start += sectorsize; + icsum++; pgoff += sectorsize; - ASSERT(pgoff < PAGE_SIZE); - goto next_block; } } - return err; } -static blk_status_t btrfs_subio_endio_read(struct inode *inode, - struct btrfs_io_bio *io_bio, blk_status_t err) -{ - bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - - if (skip_csum) { - if (unlikely(err)) - return __btrfs_correct_data_nocsum(inode, io_bio); - else - return BLK_STS_OK; - } else { - return __btrfs_subio_endio_read(inode, io_bio, err); - } -} - -static void btrfs_endio_direct_read(struct bio *bio) -{ - struct btrfs_dio_private *dip = bio->bi_private; - struct inode *inode = dip->inode; - struct bio *dio_bio; - struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - blk_status_t err = bio->bi_status; - - if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) - err = btrfs_subio_endio_read(inode, io_bio, err); - - unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, - dip->logical_offset + dip->bytes - 1); - dio_bio = dip->dio_bio; - - kfree(dip); - - dio_bio->bi_status = err; - dio_end_io(dio_bio); - btrfs_io_bio_free_csum(io_bio); - bio_put(bio); -} - static void __endio_write_update_ordered(struct inode *inode, const u64 offset, const u64 bytes, const bool uptodate) @@ -7759,21 +7588,6 @@ static void __endio_write_update_ordered(struct inode *inode, } } -static void btrfs_endio_direct_write(struct bio *bio) -{ - struct btrfs_dio_private *dip = bio->bi_private; - struct bio *dio_bio = dip->dio_bio; - - __endio_write_update_ordered(dip->inode, dip->logical_offset, - dip->bytes, !bio->bi_status); - - kfree(dip); - - dio_bio->bi_status = bio->bi_status; - dio_end_io(dio_bio); - bio_put(bio); -} - static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data, struct bio *bio, u64 offset) { @@ -7797,64 +7611,16 @@ static void btrfs_end_dio_bio(struct bio *bio) (unsigned long long)bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); - if (dip->subio_endio) - err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err); - - if (err) { - /* - * We want to perceive the errors flag being set before - * decrementing the reference count. We don't need a barrier - * since atomic operations with a return value are fully - * ordered as per atomic_t.txt - */ - dip->errors = 1; + if (bio_op(bio) == REQ_OP_READ) { + err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio), + !err); } - /* if there are more bios still pending for this dio, just exit */ - if (!atomic_dec_and_test(&dip->pending_bios)) - goto out; + if (err) + dip->dio_bio->bi_status = err; - if (dip->errors) { - bio_io_error(dip->orig_bio); - } else { - dip->dio_bio->bi_status = BLK_STS_OK; - bio_endio(dip->orig_bio); - } -out: bio_put(bio); -} - -static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, - struct btrfs_dio_private *dip, - struct bio *bio, - u64 file_offset) -{ - struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio); - u16 csum_size; - blk_status_t ret; - - /* - * We load all the csum data we need when we submit - * the first bio to reduce the csum tree search and - * contention. - */ - if (dip->logical_offset == file_offset) { - ret = btrfs_lookup_bio_sums(inode, dip->orig_bio, file_offset, - NULL); - if (ret) - return ret; - } - - if (bio == dip->orig_bio) - return 0; - - file_offset -= dip->logical_offset; - file_offset >>= inode->i_sb->s_blocksize_bits; - csum_size = btrfs_super_csum_size(btrfs_sb(inode->i_sb)->super_copy); - io_bio->csum = orig_io_bio->csum + csum_size * file_offset; - - return 0; + btrfs_dio_private_put(dip); } static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, @@ -7892,10 +7658,12 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, if (ret) goto err; } else { - ret = btrfs_lookup_and_bind_dio_csum(inode, dip, bio, - file_offset); - if (ret) - goto err; + u64 csum_offset; + + csum_offset = file_offset - dip->logical_offset; + csum_offset >>= inode->i_sb->s_blocksize_bits; + csum_offset *= btrfs_super_csum_size(fs_info->super_copy); + btrfs_io_bio(bio)->csum = dip->csums + csum_offset; } map: ret = btrfs_map_bio(fs_info, bio, 0); @@ -7903,14 +7671,53 @@ err: return ret; } -static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) +/* + * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked + * or ordered extents whether or not we submit any bios. + */ +static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, + struct inode *inode, + loff_t file_offset) { - struct inode *inode = dip->inode; + const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); + const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); + size_t dip_size; + struct btrfs_dio_private *dip; + + dip_size = sizeof(*dip); + if (!write && csum) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + size_t nblocks; + + nblocks = dio_bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; + dip_size += csum_size * nblocks; + } + + dip = kzalloc(dip_size, GFP_NOFS); + if (!dip) + return NULL; + + dip->inode = inode; + dip->logical_offset = file_offset; + dip->bytes = dio_bio->bi_iter.bi_size; + dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; + dip->dio_bio = dio_bio; + refcount_set(&dip->refs, 1); + return dip; +} + +static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, + struct bio *dio_bio, loff_t file_offset) +{ + const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); + const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const bool raid56 = (btrfs_data_alloc_profile(fs_info) & + BTRFS_BLOCK_GROUP_RAID56_MASK); + struct btrfs_dio_private *dip; struct bio *bio; - struct bio *orig_bio = dip->orig_bio; - u64 start_sector = orig_bio->bi_iter.bi_sector; - u64 file_offset = dip->logical_offset; + u64 start_sector; int async_submit = 0; u64 submit_len; int clone_offset = 0; @@ -7918,330 +7725,108 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) int ret; blk_status_t status; struct btrfs_io_geometry geom; + struct btrfs_dio_data *dio_data = iomap->private; - submit_len = orig_bio->bi_iter.bi_size; - ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio), - start_sector << 9, submit_len, &geom); - if (ret) - return -EIO; + dip = btrfs_create_dio_private(dio_bio, inode, file_offset); + if (!dip) { + if (!write) { + unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, + file_offset + dio_bio->bi_iter.bi_size - 1); + } + dio_bio->bi_status = BLK_STS_RESOURCE; + bio_endio(dio_bio); + return BLK_QC_T_NONE; + } - if (geom.len >= submit_len) { - bio = orig_bio; - dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; - goto submit; + if (!write && csum) { + /* + * Load the csums up front to reduce csum tree searches and + * contention when submitting bios. + */ + status = btrfs_lookup_bio_sums(inode, dio_bio, file_offset, + dip->csums); + if (status != BLK_STS_OK) + goto out_err; } - /* async crcs make it difficult to collect full stripe writes. */ - if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK) - async_submit = 0; - else - async_submit = 1; + start_sector = dio_bio->bi_iter.bi_sector; + submit_len = dio_bio->bi_iter.bi_size; - /* bio split */ - ASSERT(geom.len <= INT_MAX); - atomic_inc(&dip->pending_bios); do { + ret = btrfs_get_io_geometry(fs_info, btrfs_op(dio_bio), + start_sector << 9, submit_len, + &geom); + if (ret) { + status = errno_to_blk_status(ret); + goto out_err; + } + ASSERT(geom.len <= INT_MAX); + clone_len = min_t(int, submit_len, geom.len); /* * This will never fail as it's passing GPF_NOFS and * the allocation is backed by btrfs_bioset. */ - bio = btrfs_bio_clone_partial(orig_bio, clone_offset, - clone_len); + bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; ASSERT(submit_len >= clone_len); submit_len -= clone_len; - if (submit_len == 0) - break; /* * Increase the count before we submit the bio so we know * the end IO handler won't happen before we increase the * count. Otherwise, the dip might get freed before we're * done setting it up. + * + * We transfer the initial reference to the last bio, so we + * don't need to increment the reference count for the last one. */ - atomic_inc(&dip->pending_bios); + if (submit_len > 0) { + refcount_inc(&dip->refs); + /* + * If we are submitting more than one bio, submit them + * all asynchronously. The exception is RAID 5 or 6, as + * asynchronous checksums make it difficult to collect + * full stripe writes. + */ + if (!raid56) + async_submit = 1; + } status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); if (status) { bio_put(bio); - atomic_dec(&dip->pending_bios); + if (submit_len > 0) + refcount_dec(&dip->refs); goto out_err; } + dio_data->submitted += clone_len; clone_offset += clone_len; start_sector += clone_len >> 9; file_offset += clone_len; - - ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio), - start_sector << 9, submit_len, &geom); - if (ret) - goto out_err; } while (submit_len > 0); + return BLK_QC_T_NONE; -submit: - status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); - if (!status) - return 0; - - bio_put(bio); out_err: - dip->errors = 1; - /* - * Before atomic variable goto zero, we must make sure dip->errors is - * perceived to be set. This ordering is ensured by the fact that an - * atomic operations with a return value are fully ordered as per - * atomic_t.txt - */ - if (atomic_dec_and_test(&dip->pending_bios)) - bio_io_error(dip->orig_bio); - - /* bio_end_io() will handle error, so we needn't return it */ - return 0; -} - -static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, - loff_t file_offset) -{ - struct btrfs_dio_private *dip = NULL; - struct bio *bio = NULL; - struct btrfs_io_bio *io_bio; - bool write = (bio_op(dio_bio) == REQ_OP_WRITE); - int ret = 0; - - bio = btrfs_bio_clone(dio_bio); - - dip = kzalloc(sizeof(*dip), GFP_NOFS); - if (!dip) { - ret = -ENOMEM; - goto free_ordered; - } - - dip->private = dio_bio->bi_private; - dip->inode = inode; - dip->logical_offset = file_offset; - dip->bytes = dio_bio->bi_iter.bi_size; - dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; - bio->bi_private = dip; - dip->orig_bio = bio; - dip->dio_bio = dio_bio; - atomic_set(&dip->pending_bios, 0); - io_bio = btrfs_io_bio(bio); - io_bio->logical = file_offset; - - if (write) { - bio->bi_end_io = btrfs_endio_direct_write; - } else { - bio->bi_end_io = btrfs_endio_direct_read; - dip->subio_endio = btrfs_subio_endio_read; - } - - /* - * Reset the range for unsubmitted ordered extents (to a 0 length range) - * even if we fail to submit a bio, because in such case we do the - * corresponding error handling below and it must not be done a second - * time by btrfs_direct_IO(). - */ - if (write) { - struct btrfs_dio_data *dio_data = current->journal_info; - - dio_data->unsubmitted_oe_range_end = dip->logical_offset + - dip->bytes; - dio_data->unsubmitted_oe_range_start = - dio_data->unsubmitted_oe_range_end; - } - - ret = btrfs_submit_direct_hook(dip); - if (!ret) - return; - - btrfs_io_bio_free_csum(io_bio); - -free_ordered: - /* - * If we arrived here it means either we failed to submit the dip - * or we either failed to clone the dio_bio or failed to allocate the - * dip. If we cloned the dio_bio and allocated the dip, we can just - * call bio_endio against our io_bio so that we get proper resource - * cleanup if we fail to submit the dip, otherwise, we must do the - * same as btrfs_endio_direct_[write|read] because we can't call these - * callbacks - they require an allocated dip and a clone of dio_bio. - */ - if (bio && dip) { - bio_io_error(bio); - /* - * The end io callbacks free our dip, do the final put on bio - * and all the cleanup and final put for dio_bio (through - * dio_end_io()). - */ - dip = NULL; - bio = NULL; - } else { - if (write) - __endio_write_update_ordered(inode, - file_offset, - dio_bio->bi_iter.bi_size, - false); - else - unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, - file_offset + dio_bio->bi_iter.bi_size - 1); - - dio_bio->bi_status = BLK_STS_IOERR; - /* - * Releases and cleans up our dio_bio, no need to bio_put() - * nor bio_endio()/bio_io_error() against dio_bio. - */ - dio_end_io(dio_bio); - } - if (bio) - bio_put(bio); - kfree(dip); + dip->dio_bio->bi_status = status; + btrfs_dio_private_put(dip); + return BLK_QC_T_NONE; } -static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, - const struct iov_iter *iter, loff_t offset) -{ - int seg; - int i; - unsigned int blocksize_mask = fs_info->sectorsize - 1; - ssize_t retval = -EINVAL; - - if (offset & blocksize_mask) - goto out; - - if (iov_iter_alignment(iter) & blocksize_mask) - goto out; - - /* If this is a write we don't need to check anymore */ - if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter)) - return 0; - /* - * Check to make sure we don't have duplicate iov_base's in this - * iovec, if so return EINVAL, otherwise we'll get csum errors - * when reading back. - */ - for (seg = 0; seg < iter->nr_segs; seg++) { - for (i = seg + 1; i < iter->nr_segs; i++) { - if (iter->iov[seg].iov_base == iter->iov[i].iov_base) - goto out; - } - } - retval = 0; -out: - return retval; -} - -static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_dio_data dio_data = { 0 }; - struct extent_changeset *data_reserved = NULL; - loff_t offset = iocb->ki_pos; - size_t count = 0; - int flags = 0; - bool wakeup = true; - bool relock = false; - ssize_t ret; - - if (check_direct_IO(fs_info, iter, offset)) - return 0; - - inode_dio_begin(inode); - - /* - * The generic stuff only does filemap_write_and_wait_range, which - * isn't enough if we've written compressed pages to this area, so - * we need to flush the dirty pages again to make absolutely sure - * that any outstanding dirty pages are on disk. - */ - count = iov_iter_count(iter); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_fdatawrite_range(inode->i_mapping, offset, - offset + count - 1); - - if (iov_iter_rw(iter) == WRITE) { - /* - * If the write DIO is beyond the EOF, we need update - * the isize, but it is protected by i_mutex. So we can - * not unlock the i_mutex at this case. - */ - if (offset + count <= inode->i_size) { - dio_data.overwrite = 1; - inode_unlock(inode); - relock = true; - } else if (iocb->ki_flags & IOCB_NOWAIT) { - ret = -EAGAIN; - goto out; - } - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, - offset, count); - if (ret) - goto out; - - /* - * We need to know how many extents we reserved so that we can - * do the accounting properly if we go over the number we - * originally calculated. Abuse current->journal_info for this. - */ - dio_data.reserve = round_up(count, - fs_info->sectorsize); - dio_data.unsubmitted_oe_range_start = (u64)offset; - dio_data.unsubmitted_oe_range_end = (u64)offset; - current->journal_info = &dio_data; - down_read(&BTRFS_I(inode)->dio_sem); - } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, - &BTRFS_I(inode)->runtime_flags)) { - inode_dio_end(inode); - flags = DIO_LOCKING | DIO_SKIP_HOLES; - wakeup = false; - } - - ret = __blockdev_direct_IO(iocb, inode, - fs_info->fs_devices->latest_bdev, - iter, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, flags); - if (iov_iter_rw(iter) == WRITE) { - up_read(&BTRFS_I(inode)->dio_sem); - current->journal_info = NULL; - if (ret < 0 && ret != -EIOCBQUEUED) { - if (dio_data.reserve) - btrfs_delalloc_release_space(inode, data_reserved, - offset, dio_data.reserve, true); - /* - * On error we might have left some ordered extents - * without submitting corresponding bios for them, so - * cleanup them up to avoid other tasks getting them - * and waiting for them to complete forever. - */ - if (dio_data.unsubmitted_oe_range_start < - dio_data.unsubmitted_oe_range_end) - __endio_write_update_ordered(inode, - dio_data.unsubmitted_oe_range_start, - dio_data.unsubmitted_oe_range_end - - dio_data.unsubmitted_oe_range_start, - false); - } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(inode, data_reserved, - offset, count - (size_t)ret, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), count); - } -out: - if (wakeup) - inode_dio_end(inode); - if (relock) - inode_lock(inode); +const struct iomap_ops btrfs_dio_iomap_ops = { + .iomap_begin = btrfs_dio_iomap_begin, + .iomap_end = btrfs_dio_iomap_end, +}; - extent_changeset_free(data_reserved); - return ret; -} +const struct iomap_dio_ops btrfs_dops = { + .submit_io = btrfs_submit_direct, +}; #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) @@ -10539,7 +10124,7 @@ static const struct address_space_operations btrfs_aops = { .writepage = btrfs_writepage, .writepages = btrfs_writepages, .readahead = btrfs_readahead, - .direct_IO = btrfs_direct_IO, + .direct_IO = noop_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, #ifdef CONFIG_MIGRATION diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 40b729dce91c..168deb8ef68a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -660,7 +660,7 @@ static noinline int create_subvol(struct inode *dir, goto fail; key.offset = (u64)-1; - new_root = btrfs_get_fs_root(fs_info, &key, true); + new_root = btrfs_get_fs_root(fs_info, objectid, true); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); btrfs_abort_transaction(trans, ret); @@ -748,9 +748,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct btrfs_pending_snapshot *pending_snapshot; struct btrfs_trans_handle *trans; int ret; - bool snapshot_force_cow = false; - if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return -EINVAL; if (atomic_read(&root->nr_swapfiles)) { @@ -771,27 +770,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, goto free_pending; } - /* - * Force new buffered writes to reserve space even when NOCOW is - * possible. This is to avoid later writeback (running dealloc) to - * fallback to COW mode and unexpectedly fail with ENOSPC. - */ - btrfs_drew_read_lock(&root->snapshot_lock); - - ret = btrfs_start_delalloc_snapshot(root); - if (ret) - goto dec_and_free; - - /* - * All previous writes have started writeback in NOCOW mode, so now - * we force future writes to fallback to COW mode during snapshot - * creation. - */ - atomic_inc(&root->snapshot_force_cow); - snapshot_force_cow = true; - - btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); - btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); /* @@ -806,7 +784,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, &pending_snapshot->block_rsv, 8, false); if (ret) - goto dec_and_free; + goto free_pending; pending_snapshot->dentry = dentry; pending_snapshot->root = root; @@ -848,11 +826,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, fail: btrfs_put_root(pending_snapshot->snap); btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); -dec_and_free: - if (snapshot_force_cow) - atomic_dec(&root->snapshot_force_cow); - btrfs_drew_read_unlock(&root->snapshot_lock); - free_pending: kfree(pending_snapshot->root_item); btrfs_free_path(pending_snapshot->path); @@ -983,6 +956,45 @@ out_unlock: return error; } +static noinline int btrfs_mksnapshot(const struct path *parent, + const char *name, int namelen, + struct btrfs_root *root, + bool readonly, + struct btrfs_qgroup_inherit *inherit) +{ + int ret; + bool snapshot_force_cow = false; + + /* + * Force new buffered writes to reserve space even when NOCOW is + * possible. This is to avoid later writeback (running dealloc) to + * fallback to COW mode and unexpectedly fail with ENOSPC. + */ + btrfs_drew_read_lock(&root->snapshot_lock); + + ret = btrfs_start_delalloc_snapshot(root); + if (ret) + goto out; + + /* + * All previous writes have started writeback in NOCOW mode, so now + * we force future writes to fallback to COW mode during snapshot + * creation. + */ + atomic_inc(&root->snapshot_force_cow); + snapshot_force_cow = true; + + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); + + ret = btrfs_mksubvol(parent, name, namelen, + root, readonly, inherit); +out: + if (snapshot_force_cow) + atomic_dec(&root->snapshot_force_cow); + btrfs_drew_read_unlock(&root->snapshot_lock); + return ret; +} + /* * When we're defragging a range, we don't want to kick it off again * if it is really just waiting for delalloc to send it down. @@ -1762,7 +1774,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, */ ret = -EPERM; } else { - ret = btrfs_mksubvol(&file->f_path, name, namelen, + ret = btrfs_mksnapshot(&file->f_path, name, namelen, BTRFS_I(src_inode)->root, readonly, inherit); } @@ -2127,10 +2139,7 @@ static noinline int search_ioctl(struct inode *inode, /* search the root of the inode that was passed */ root = btrfs_grab_root(BTRFS_I(inode)->root); } else { - key.objectid = sk->tree_id; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - root = btrfs_get_fs_root(info, &key, true); + root = btrfs_get_fs_root(info, sk->tree_id, true); if (IS_ERR(root)) { btrfs_free_path(path); return PTR_ERR(root); @@ -2263,10 +2272,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1]; - key.objectid = tree_id; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - root = btrfs_get_fs_root(info, &key, true); + root = btrfs_get_fs_root(info, tree_id, true); if (IS_ERR(root)) { ret = PTR_ERR(root); root = NULL; @@ -2359,10 +2365,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, if (dirid != upper_limit.objectid) { ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1]; - key.objectid = treeid; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - root = btrfs_get_fs_root(fs_info, &key, true); + root = btrfs_get_fs_root(fs_info, treeid, true); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out; @@ -2421,7 +2424,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, goto out_put; } - temp_inode = btrfs_iget(sb, &key2, root); + temp_inode = btrfs_iget(sb, key2.objectid, root); if (IS_ERR(temp_inode)) { ret = PTR_ERR(temp_inode); goto out_put; @@ -2608,9 +2611,7 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp) /* Get root_item of inode's subvolume */ key.objectid = BTRFS_I(inode)->root->root_key.objectid; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - root = btrfs_get_fs_root(fs_info, &key, true); + root = btrfs_get_fs_root(fs_info, key.objectid, true); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out_free; @@ -3278,7 +3279,6 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) struct btrfs_dir_item *di; struct btrfs_trans_handle *trans; struct btrfs_path *path = NULL; - struct btrfs_key location; struct btrfs_disk_key disk_key; u64 objectid = 0; u64 dir_id; @@ -3299,11 +3299,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) if (!objectid) objectid = BTRFS_FS_TREE_OBJECTID; - location.objectid = objectid; - location.type = BTRFS_ROOT_ITEM_KEY; - location.offset = (u64)-1; - - new_root = btrfs_get_fs_root(fs_info, &location, true); + new_root = btrfs_get_fs_root(fs_info, objectid, true); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); goto out; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index fb647d8cf527..f75612e18a82 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -410,6 +410,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) * The rwlock is held for write upon exit. */ void btrfs_tree_lock(struct extent_buffer *eb) + __acquires(&eb->lock) { u64 start_ns = 0; diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 72bab64ecf60..6461ebc3a1c1 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -6,6 +6,7 @@ #include <linux/sched.h> #include <linux/wait.h> #include <asm/div64.h> +#include <linux/rbtree.h> #define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) @@ -58,4 +59,57 @@ static inline bool has_single_bit_set(u64 n) return is_power_of_two_u64(n); } +/* + * Simple bytenr based rb_tree relate structures + * + * Any structure wants to use bytenr as single search index should have their + * structure start with these members. + */ +struct rb_simple_node { + struct rb_node rb_node; + u64 bytenr; +}; + +static inline struct rb_node *rb_simple_search(struct rb_root *root, u64 bytenr) +{ + struct rb_node *node = root->rb_node; + struct rb_simple_node *entry; + + while (node) { + entry = rb_entry(node, struct rb_simple_node, rb_node); + + if (bytenr < entry->bytenr) + node = node->rb_left; + else if (bytenr > entry->bytenr) + node = node->rb_right; + else + return node; + } + return NULL; +} + +static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct rb_simple_node *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct rb_simple_node, rb_node); + + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return parent; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + #endif diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index ff1ff90e48b1..2dcb1cb21634 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -408,19 +408,14 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, struct btrfs_root *parent_root) { struct super_block *sb = root->fs_info->sb; - struct btrfs_key key; struct inode *parent_inode, *child_inode; int ret; - key.objectid = BTRFS_FIRST_FREE_OBJECTID; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - - parent_inode = btrfs_iget(sb, &key, parent_root); + parent_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, parent_root); if (IS_ERR(parent_inode)) return PTR_ERR(parent_inode); - child_inode = btrfs_iget(sb, &key, root); + child_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, root); if (IS_ERR(child_inode)) { iput(parent_inode); return PTR_ERR(child_inode); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index c3888fb367e7..5bd4089ad0e1 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2622,6 +2622,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, struct btrfs_root *quota_root; struct btrfs_qgroup *srcgroup; struct btrfs_qgroup *dstgroup; + bool need_rescan = false; u32 level_size = 0; u64 nums; @@ -2765,6 +2766,13 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, goto unlock; } ++i_qgroups; + + /* + * If we're doing a snapshot, and adding the snapshot to a new + * qgroup, the numbers are guaranteed to be incorrect. + */ + if (srcid) + need_rescan = true; } for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) { @@ -2784,6 +2792,9 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, dst->rfer = src->rfer - level_size; dst->rfer_cmpr = src->rfer_cmpr - level_size; + + /* Manually tweaking numbers certainly needs a rescan */ + need_rescan = true; } for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) { struct btrfs_qgroup *src; @@ -2802,6 +2813,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, dst->excl = src->excl + level_size; dst->excl_cmpr = src->excl_cmpr + level_size; + need_rescan = true; } unlock: @@ -2809,6 +2821,8 @@ unlock: out: if (!committing) mutex_unlock(&fs_info->qgroup_ioctl_lock); + if (need_rescan) + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; return ret; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 03bc7134e8cb..3bbae80c752f 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -24,6 +24,7 @@ #include "delalloc-space.h" #include "block-group.h" #include "backref.h" +#include "misc.h" /* * Relocation overview @@ -72,100 +73,15 @@ * The entry point of relocation is relocate_block_group() function. */ -/* - * backref_node, mapping_node and tree_block start with this - */ -struct tree_entry { - struct rb_node rb_node; - u64 bytenr; -}; - -/* - * present a tree block in the backref cache - */ -struct backref_node { - struct rb_node rb_node; - u64 bytenr; - - u64 new_bytenr; - /* objectid of tree block owner, can be not uptodate */ - u64 owner; - /* link to pending, changed or detached list */ - struct list_head list; - /* list of upper level blocks reference this block */ - struct list_head upper; - /* list of child blocks in the cache */ - struct list_head lower; - /* NULL if this node is not tree root */ - struct btrfs_root *root; - /* extent buffer got by COW the block */ - struct extent_buffer *eb; - /* level of tree block */ - unsigned int level:8; - /* is the block in non-reference counted tree */ - unsigned int cowonly:1; - /* 1 if no child node in the cache */ - unsigned int lowest:1; - /* is the extent buffer locked */ - unsigned int locked:1; - /* has the block been processed */ - unsigned int processed:1; - /* have backrefs of this block been checked */ - unsigned int checked:1; - /* - * 1 if corresponding block has been cowed but some upper - * level block pointers may not point to the new location - */ - unsigned int pending:1; - /* - * 1 if the backref node isn't connected to any other - * backref node. - */ - unsigned int detached:1; -}; - -/* - * present a block pointer in the backref cache - */ -struct backref_edge { - struct list_head list[2]; - struct backref_node *node[2]; -}; - -#define LOWER 0 -#define UPPER 1 #define RELOCATION_RESERVED_NODES 256 - -struct backref_cache { - /* red black tree of all backref nodes in the cache */ - struct rb_root rb_root; - /* for passing backref nodes to btrfs_reloc_cow_block */ - struct backref_node *path[BTRFS_MAX_LEVEL]; - /* - * list of blocks that have been cowed but some block - * pointers in upper level blocks may not reflect the - * new location - */ - struct list_head pending[BTRFS_MAX_LEVEL]; - /* list of backref nodes with no child node */ - struct list_head leaves; - /* list of blocks that have been cowed in current transaction */ - struct list_head changed; - /* list of detached backref node. */ - struct list_head detached; - - u64 last_trans; - - int nr_nodes; - int nr_edges; -}; - /* * map address of tree root to tree */ struct mapping_node { - struct rb_node rb_node; - u64 bytenr; + struct { + struct rb_node rb_node; + u64 bytenr; + }; /* Use rb_simle_node for search/insert */ void *data; }; @@ -178,8 +94,10 @@ struct mapping_tree { * present a tree block to process */ struct tree_block { - struct rb_node rb_node; - u64 bytenr; + struct { + struct rb_node rb_node; + u64 bytenr; + }; /* Use rb_simple_node for search/insert */ struct btrfs_key key; unsigned int level:8; unsigned int key_ready:1; @@ -204,7 +122,7 @@ struct reloc_control { struct btrfs_block_rsv *block_rsv; - struct backref_cache backref_cache; + struct btrfs_backref_cache backref_cache; struct file_extent_cluster cluster; /* tree blocks have been processed */ @@ -235,168 +153,41 @@ struct reloc_control { #define MOVE_DATA_EXTENTS 0 #define UPDATE_DATA_PTRS 1 -static void remove_backref_node(struct backref_cache *cache, - struct backref_node *node); -static void __mark_block_processed(struct reloc_control *rc, - struct backref_node *node); - -static void mapping_tree_init(struct mapping_tree *tree) -{ - tree->rb_root = RB_ROOT; - spin_lock_init(&tree->lock); -} - -static void backref_cache_init(struct backref_cache *cache) -{ - int i; - cache->rb_root = RB_ROOT; - for (i = 0; i < BTRFS_MAX_LEVEL; i++) - INIT_LIST_HEAD(&cache->pending[i]); - INIT_LIST_HEAD(&cache->changed); - INIT_LIST_HEAD(&cache->detached); - INIT_LIST_HEAD(&cache->leaves); -} - -static void backref_cache_cleanup(struct backref_cache *cache) -{ - struct backref_node *node; - int i; - - while (!list_empty(&cache->detached)) { - node = list_entry(cache->detached.next, - struct backref_node, list); - remove_backref_node(cache, node); - } - - while (!list_empty(&cache->leaves)) { - node = list_entry(cache->leaves.next, - struct backref_node, lower); - remove_backref_node(cache, node); - } - - cache->last_trans = 0; - - for (i = 0; i < BTRFS_MAX_LEVEL; i++) - ASSERT(list_empty(&cache->pending[i])); - ASSERT(list_empty(&cache->changed)); - ASSERT(list_empty(&cache->detached)); - ASSERT(RB_EMPTY_ROOT(&cache->rb_root)); - ASSERT(!cache->nr_nodes); - ASSERT(!cache->nr_edges); -} - -static struct backref_node *alloc_backref_node(struct backref_cache *cache) -{ - struct backref_node *node; - - node = kzalloc(sizeof(*node), GFP_NOFS); - if (node) { - INIT_LIST_HEAD(&node->list); - INIT_LIST_HEAD(&node->upper); - INIT_LIST_HEAD(&node->lower); - RB_CLEAR_NODE(&node->rb_node); - cache->nr_nodes++; - } - return node; -} - -static void free_backref_node(struct backref_cache *cache, - struct backref_node *node) -{ - if (node) { - cache->nr_nodes--; - btrfs_put_root(node->root); - kfree(node); - } -} - -static struct backref_edge *alloc_backref_edge(struct backref_cache *cache) -{ - struct backref_edge *edge; - - edge = kzalloc(sizeof(*edge), GFP_NOFS); - if (edge) - cache->nr_edges++; - return edge; -} - -static void free_backref_edge(struct backref_cache *cache, - struct backref_edge *edge) +static void mark_block_processed(struct reloc_control *rc, + struct btrfs_backref_node *node) { - if (edge) { - cache->nr_edges--; - kfree(edge); - } -} + u32 blocksize; -static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, - struct rb_node *node) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct tree_entry *entry; - - while (*p) { - parent = *p; - entry = rb_entry(parent, struct tree_entry, rb_node); - - if (bytenr < entry->bytenr) - p = &(*p)->rb_left; - else if (bytenr > entry->bytenr) - p = &(*p)->rb_right; - else - return parent; + if (node->level == 0 || + in_range(node->bytenr, rc->block_group->start, + rc->block_group->length)) { + blocksize = rc->extent_root->fs_info->nodesize; + set_extent_bits(&rc->processed_blocks, node->bytenr, + node->bytenr + blocksize - 1, EXTENT_DIRTY); } - - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; + node->processed = 1; } -static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) -{ - struct rb_node *n = root->rb_node; - struct tree_entry *entry; - - while (n) { - entry = rb_entry(n, struct tree_entry, rb_node); - if (bytenr < entry->bytenr) - n = n->rb_left; - else if (bytenr > entry->bytenr) - n = n->rb_right; - else - return n; - } - return NULL; -} - -static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr) +static void mapping_tree_init(struct mapping_tree *tree) { - - struct btrfs_fs_info *fs_info = NULL; - struct backref_node *bnode = rb_entry(rb_node, struct backref_node, - rb_node); - if (bnode->root) - fs_info = bnode->root->fs_info; - btrfs_panic(fs_info, errno, - "Inconsistency in backref cache found at offset %llu", - bytenr); + tree->rb_root = RB_ROOT; + spin_lock_init(&tree->lock); } /* * walk up backref nodes until reach node presents tree root */ -static struct backref_node *walk_up_backref(struct backref_node *node, - struct backref_edge *edges[], - int *index) +static struct btrfs_backref_node *walk_up_backref( + struct btrfs_backref_node *node, + struct btrfs_backref_edge *edges[], int *index) { - struct backref_edge *edge; + struct btrfs_backref_edge *edge; int idx = *index; while (!list_empty(&node->upper)) { edge = list_entry(node->upper.next, - struct backref_edge, list[LOWER]); + struct btrfs_backref_edge, list[LOWER]); edges[idx++] = edge; node = edge->node[UPPER]; } @@ -408,11 +199,11 @@ static struct backref_node *walk_up_backref(struct backref_node *node, /* * walk down backref nodes to find start of next reference path */ -static struct backref_node *walk_down_backref(struct backref_edge *edges[], - int *index) +static struct btrfs_backref_node *walk_down_backref( + struct btrfs_backref_edge *edges[], int *index) { - struct backref_edge *edge; - struct backref_node *lower; + struct btrfs_backref_edge *edge; + struct btrfs_backref_node *lower; int idx = *index; while (idx > 0) { @@ -423,7 +214,7 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[], continue; } edge = list_entry(edge->list[LOWER].next, - struct backref_edge, list[LOWER]); + struct btrfs_backref_edge, list[LOWER]); edges[idx - 1] = edge; *index = idx; return edge->node[UPPER]; @@ -432,95 +223,24 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[], return NULL; } -static void unlock_node_buffer(struct backref_node *node) -{ - if (node->locked) { - btrfs_tree_unlock(node->eb); - node->locked = 0; - } -} - -static void drop_node_buffer(struct backref_node *node) -{ - if (node->eb) { - unlock_node_buffer(node); - free_extent_buffer(node->eb); - node->eb = NULL; - } -} - -static void drop_backref_node(struct backref_cache *tree, - struct backref_node *node) -{ - BUG_ON(!list_empty(&node->upper)); - - drop_node_buffer(node); - list_del(&node->list); - list_del(&node->lower); - if (!RB_EMPTY_NODE(&node->rb_node)) - rb_erase(&node->rb_node, &tree->rb_root); - free_backref_node(tree, node); -} - -/* - * remove a backref node from the backref cache - */ -static void remove_backref_node(struct backref_cache *cache, - struct backref_node *node) -{ - struct backref_node *upper; - struct backref_edge *edge; - - if (!node) - return; - - BUG_ON(!node->lowest && !node->detached); - while (!list_empty(&node->upper)) { - edge = list_entry(node->upper.next, struct backref_edge, - list[LOWER]); - upper = edge->node[UPPER]; - list_del(&edge->list[LOWER]); - list_del(&edge->list[UPPER]); - free_backref_edge(cache, edge); - - if (RB_EMPTY_NODE(&upper->rb_node)) { - BUG_ON(!list_empty(&node->upper)); - drop_backref_node(cache, node); - node = upper; - node->lowest = 1; - continue; - } - /* - * add the node to leaf node list if no other - * child block cached. - */ - if (list_empty(&upper->lower)) { - list_add_tail(&upper->lower, &cache->leaves); - upper->lowest = 1; - } - } - - drop_backref_node(cache, node); -} - -static void update_backref_node(struct backref_cache *cache, - struct backref_node *node, u64 bytenr) +static void update_backref_node(struct btrfs_backref_cache *cache, + struct btrfs_backref_node *node, u64 bytenr) { struct rb_node *rb_node; rb_erase(&node->rb_node, &cache->rb_root); node->bytenr = bytenr; - rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); + rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node); if (rb_node) - backref_tree_panic(rb_node, -EEXIST, bytenr); + btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST); } /* * update backref cache after a transaction commit */ static int update_backref_cache(struct btrfs_trans_handle *trans, - struct backref_cache *cache) + struct btrfs_backref_cache *cache) { - struct backref_node *node; + struct btrfs_backref_node *node; int level = 0; if (cache->last_trans == 0) { @@ -538,13 +258,13 @@ static int update_backref_cache(struct btrfs_trans_handle *trans, */ while (!list_empty(&cache->detached)) { node = list_entry(cache->detached.next, - struct backref_node, list); - remove_backref_node(cache, node); + struct btrfs_backref_node, list); + btrfs_backref_cleanup_node(cache, node); } while (!list_empty(&cache->changed)) { node = list_entry(cache->changed.next, - struct backref_node, list); + struct btrfs_backref_node, list); list_del_init(&node->list); BUG_ON(node->pending); update_backref_node(cache, node, node->new_bytenr); @@ -585,7 +305,8 @@ static bool reloc_root_is_dead(struct btrfs_root *root) * * Reloc tree after swap is considered dead, thus not considered as valid. * This is enough for most callers, as they don't distinguish dead reloc root - * from no reloc root. But should_ignore_root() below is a special case. + * from no reloc root. But btrfs_should_ignore_reloc_root() below is a + * special case. */ static bool have_reloc_root(struct btrfs_root *root) { @@ -596,11 +317,11 @@ static bool have_reloc_root(struct btrfs_root *root) return true; } -static int should_ignore_root(struct btrfs_root *root) +int btrfs_should_ignore_reloc_root(struct btrfs_root *root) { struct btrfs_root *reloc_root; - if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return 0; /* This root has been merged with its reloc tree, we can ignore it */ @@ -622,18 +343,20 @@ static int should_ignore_root(struct btrfs_root *root) */ return 1; } + /* * find reloc tree by address of tree root */ -static struct btrfs_root *find_reloc_root(struct reloc_control *rc, - u64 bytenr) +struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr) { + struct reloc_control *rc = fs_info->reloc_ctl; struct rb_node *rb_node; struct mapping_node *node; struct btrfs_root *root = NULL; + ASSERT(rc); spin_lock(&rc->reloc_root_tree.lock); - rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr); + rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); root = (struct btrfs_root *)node->data; @@ -642,594 +365,165 @@ static struct btrfs_root *find_reloc_root(struct reloc_control *rc, return btrfs_grab_root(root); } -static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, - u64 root_objectid) +/* + * For useless nodes, do two major clean ups: + * + * - Cleanup the children edges and nodes + * If child node is also orphan (no parent) during cleanup, then the child + * node will also be cleaned up. + * + * - Freeing up leaves (level 0), keeps nodes detached + * For nodes, the node is still cached as "detached" + * + * Return false if @node is not in the @useless_nodes list. + * Return true if @node is in the @useless_nodes list. + */ +static bool handle_useless_nodes(struct reloc_control *rc, + struct btrfs_backref_node *node) { - struct btrfs_key key; + struct btrfs_backref_cache *cache = &rc->backref_cache; + struct list_head *useless_node = &cache->useless_node; + bool ret = false; - key.objectid = root_objectid; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; + while (!list_empty(useless_node)) { + struct btrfs_backref_node *cur; - return btrfs_get_fs_root(fs_info, &key, false); -} + cur = list_first_entry(useless_node, struct btrfs_backref_node, + list); + list_del_init(&cur->list); -static noinline_for_stack -int find_inline_backref(struct extent_buffer *leaf, int slot, - unsigned long *ptr, unsigned long *end) -{ - struct btrfs_key key; - struct btrfs_extent_item *ei; - struct btrfs_tree_block_info *bi; - u32 item_size; + /* Only tree root nodes can be added to @useless_nodes */ + ASSERT(list_empty(&cur->upper)); - btrfs_item_key_to_cpu(leaf, &key, slot); + if (cur == node) + ret = true; - item_size = btrfs_item_size_nr(leaf, slot); - if (item_size < sizeof(*ei)) { - btrfs_print_v0_err(leaf->fs_info); - btrfs_handle_fs_error(leaf->fs_info, -EINVAL, NULL); - return 1; - } - ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); - WARN_ON(!(btrfs_extent_flags(leaf, ei) & - BTRFS_EXTENT_FLAG_TREE_BLOCK)); + /* The node is the lowest node */ + if (cur->lowest) { + list_del_init(&cur->lower); + cur->lowest = 0; + } - if (key.type == BTRFS_EXTENT_ITEM_KEY && - item_size <= sizeof(*ei) + sizeof(*bi)) { - WARN_ON(item_size < sizeof(*ei) + sizeof(*bi)); - return 1; - } - if (key.type == BTRFS_METADATA_ITEM_KEY && - item_size <= sizeof(*ei)) { - WARN_ON(item_size < sizeof(*ei)); - return 1; - } + /* Cleanup the lower edges */ + while (!list_empty(&cur->lower)) { + struct btrfs_backref_edge *edge; + struct btrfs_backref_node *lower; - if (key.type == BTRFS_EXTENT_ITEM_KEY) { - bi = (struct btrfs_tree_block_info *)(ei + 1); - *ptr = (unsigned long)(bi + 1); - } else { - *ptr = (unsigned long)(ei + 1); + edge = list_entry(cur->lower.next, + struct btrfs_backref_edge, list[UPPER]); + list_del(&edge->list[UPPER]); + list_del(&edge->list[LOWER]); + lower = edge->node[LOWER]; + btrfs_backref_free_edge(cache, edge); + + /* Child node is also orphan, queue for cleanup */ + if (list_empty(&lower->upper)) + list_add(&lower->list, useless_node); + } + /* Mark this block processed for relocation */ + mark_block_processed(rc, cur); + + /* + * Backref nodes for tree leaves are deleted from the cache. + * Backref nodes for upper level tree blocks are left in the + * cache to avoid unnecessary backref lookup. + */ + if (cur->level > 0) { + list_add(&cur->list, &cache->detached); + cur->detached = 1; + } else { + rb_erase(&cur->rb_node, &cache->rb_root); + btrfs_backref_free_node(cache, cur); + } } - *end = (unsigned long)ei + item_size; - return 0; + return ret; } /* - * build backref tree for a given tree block. root of the backref tree - * corresponds the tree block, leaves of the backref tree correspond - * roots of b-trees that reference the tree block. + * Build backref tree for a given tree block. Root of the backref tree + * corresponds the tree block, leaves of the backref tree correspond roots of + * b-trees that reference the tree block. * - * the basic idea of this function is check backrefs of a given block - * to find upper level blocks that reference the block, and then check - * backrefs of these upper level blocks recursively. the recursion stop - * when tree root is reached or backrefs for the block is cached. + * The basic idea of this function is check backrefs of a given block to find + * upper level blocks that reference the block, and then check backrefs of + * these upper level blocks recursively. The recursion stops when tree root is + * reached or backrefs for the block is cached. * - * NOTE: if we find backrefs for a block are cached, we know backrefs - * for all upper level blocks that directly/indirectly reference the - * block are also cached. + * NOTE: if we find that backrefs for a block are cached, we know backrefs for + * all upper level blocks that directly/indirectly reference the block are also + * cached. */ -static noinline_for_stack -struct backref_node *build_backref_tree(struct reloc_control *rc, - struct btrfs_key *node_key, - int level, u64 bytenr) +static noinline_for_stack struct btrfs_backref_node *build_backref_tree( + struct reloc_control *rc, struct btrfs_key *node_key, + int level, u64 bytenr) { - struct backref_cache *cache = &rc->backref_cache; - struct btrfs_path *path1; /* For searching extent root */ - struct btrfs_path *path2; /* For searching parent of TREE_BLOCK_REF */ - struct extent_buffer *eb; - struct btrfs_root *root; - struct backref_node *cur; - struct backref_node *upper; - struct backref_node *lower; - struct backref_node *node = NULL; - struct backref_node *exist = NULL; - struct backref_edge *edge; - struct rb_node *rb_node; - struct btrfs_key key; - unsigned long end; - unsigned long ptr; - LIST_HEAD(list); /* Pending edge list, upper node needs to be checked */ - LIST_HEAD(useless); - int cowonly; + struct btrfs_backref_iter *iter; + struct btrfs_backref_cache *cache = &rc->backref_cache; + /* For searching parent of TREE_BLOCK_REF */ + struct btrfs_path *path; + struct btrfs_backref_node *cur; + struct btrfs_backref_node *node = NULL; + struct btrfs_backref_edge *edge; int ret; int err = 0; - bool need_check = true; - path1 = btrfs_alloc_path(); - path2 = btrfs_alloc_path(); - if (!path1 || !path2) { + iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS); + if (!iter) + return ERR_PTR(-ENOMEM); + path = btrfs_alloc_path(); + if (!path) { err = -ENOMEM; goto out; } - node = alloc_backref_node(cache); + node = btrfs_backref_alloc_node(cache, bytenr, level); if (!node) { err = -ENOMEM; goto out; } - node->bytenr = bytenr; - node->level = level; node->lowest = 1; cur = node; -again: - end = 0; - ptr = 0; - key.objectid = cur->bytenr; - key.type = BTRFS_METADATA_ITEM_KEY; - key.offset = (u64)-1; - - path1->search_commit_root = 1; - path1->skip_locking = 1; - ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1, - 0, 0); - if (ret < 0) { - err = ret; - goto out; - } - ASSERT(ret); - ASSERT(path1->slots[0]); - - path1->slots[0]--; - WARN_ON(cur->checked); - if (!list_empty(&cur->upper)) { - /* - * the backref was added previously when processing - * backref of type BTRFS_TREE_BLOCK_REF_KEY - */ - ASSERT(list_is_singular(&cur->upper)); - edge = list_entry(cur->upper.next, struct backref_edge, - list[LOWER]); - ASSERT(list_empty(&edge->list[UPPER])); - exist = edge->node[UPPER]; - /* - * add the upper level block to pending list if we need - * check its backrefs - */ - if (!exist->checked) - list_add_tail(&edge->list[UPPER], &list); - } else { - exist = NULL; - } - - while (1) { - cond_resched(); - eb = path1->nodes[0]; - - if (ptr >= end) { - if (path1->slots[0] >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(rc->extent_root, path1); - if (ret < 0) { - err = ret; - goto out; - } - if (ret > 0) - break; - eb = path1->nodes[0]; - } - - btrfs_item_key_to_cpu(eb, &key, path1->slots[0]); - if (key.objectid != cur->bytenr) { - WARN_ON(exist); - break; - } - - if (key.type == BTRFS_EXTENT_ITEM_KEY || - key.type == BTRFS_METADATA_ITEM_KEY) { - ret = find_inline_backref(eb, path1->slots[0], - &ptr, &end); - if (ret) - goto next; - } - } - - if (ptr < end) { - /* update key for inline back ref */ - struct btrfs_extent_inline_ref *iref; - int type; - iref = (struct btrfs_extent_inline_ref *)ptr; - type = btrfs_get_extent_inline_ref_type(eb, iref, - BTRFS_REF_TYPE_BLOCK); - if (type == BTRFS_REF_TYPE_INVALID) { - err = -EUCLEAN; - goto out; - } - key.type = type; - key.offset = btrfs_extent_inline_ref_offset(eb, iref); - - WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY && - key.type != BTRFS_SHARED_BLOCK_REF_KEY); - } - - /* - * Parent node found and matches current inline ref, no need to - * rebuild this node for this inline ref. - */ - if (exist && - ((key.type == BTRFS_TREE_BLOCK_REF_KEY && - exist->owner == key.offset) || - (key.type == BTRFS_SHARED_BLOCK_REF_KEY && - exist->bytenr == key.offset))) { - exist = NULL; - goto next; - } - - /* SHARED_BLOCK_REF means key.offset is the parent bytenr */ - if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { - if (key.objectid == key.offset) { - /* - * Only root blocks of reloc trees use backref - * pointing to itself. - */ - root = find_reloc_root(rc, cur->bytenr); - ASSERT(root); - cur->root = root; - break; - } - - edge = alloc_backref_edge(cache); - if (!edge) { - err = -ENOMEM; - goto out; - } - rb_node = tree_search(&cache->rb_root, key.offset); - if (!rb_node) { - upper = alloc_backref_node(cache); - if (!upper) { - free_backref_edge(cache, edge); - err = -ENOMEM; - goto out; - } - upper->bytenr = key.offset; - upper->level = cur->level + 1; - /* - * backrefs for the upper level block isn't - * cached, add the block to pending list - */ - list_add_tail(&edge->list[UPPER], &list); - } else { - upper = rb_entry(rb_node, struct backref_node, - rb_node); - ASSERT(upper->checked); - INIT_LIST_HEAD(&edge->list[UPPER]); - } - list_add_tail(&edge->list[LOWER], &cur->upper); - edge->node[LOWER] = cur; - edge->node[UPPER] = upper; - - goto next; - } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { - err = -EINVAL; - btrfs_print_v0_err(rc->extent_root->fs_info); - btrfs_handle_fs_error(rc->extent_root->fs_info, err, - NULL); - goto out; - } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { - goto next; - } - - /* - * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset - * means the root objectid. We need to search the tree to get - * its parent bytenr. - */ - root = read_fs_root(rc->extent_root->fs_info, key.offset); - if (IS_ERR(root)) { - err = PTR_ERR(root); - goto out; - } - - if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) - cur->cowonly = 1; - - if (btrfs_root_level(&root->root_item) == cur->level) { - /* tree root */ - ASSERT(btrfs_root_bytenr(&root->root_item) == - cur->bytenr); - if (should_ignore_root(root)) { - btrfs_put_root(root); - list_add(&cur->list, &useless); - } else { - cur->root = root; - } - break; - } - - level = cur->level + 1; - - /* Search the tree to find parent blocks referring the block. */ - path2->search_commit_root = 1; - path2->skip_locking = 1; - path2->lowest_level = level; - ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0); - path2->lowest_level = 0; + /* Breadth-first search to build backref cache */ + do { + ret = btrfs_backref_add_tree_node(cache, path, iter, node_key, + cur); if (ret < 0) { - btrfs_put_root(root); err = ret; goto out; } - if (ret > 0 && path2->slots[level] > 0) - path2->slots[level]--; - - eb = path2->nodes[level]; - if (btrfs_node_blockptr(eb, path2->slots[level]) != - cur->bytenr) { - btrfs_err(root->fs_info, - "couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)", - cur->bytenr, level - 1, - root->root_key.objectid, - node_key->objectid, node_key->type, - node_key->offset); - btrfs_put_root(root); - err = -ENOENT; - goto out; - } - lower = cur; - need_check = true; - - /* Add all nodes and edges in the path */ - for (; level < BTRFS_MAX_LEVEL; level++) { - if (!path2->nodes[level]) { - ASSERT(btrfs_root_bytenr(&root->root_item) == - lower->bytenr); - if (should_ignore_root(root)) { - btrfs_put_root(root); - list_add(&lower->list, &useless); - } else { - lower->root = root; - } - break; - } - - edge = alloc_backref_edge(cache); - if (!edge) { - btrfs_put_root(root); - err = -ENOMEM; - goto out; - } - - eb = path2->nodes[level]; - rb_node = tree_search(&cache->rb_root, eb->start); - if (!rb_node) { - upper = alloc_backref_node(cache); - if (!upper) { - btrfs_put_root(root); - free_backref_edge(cache, edge); - err = -ENOMEM; - goto out; - } - upper->bytenr = eb->start; - upper->owner = btrfs_header_owner(eb); - upper->level = lower->level + 1; - if (!test_bit(BTRFS_ROOT_REF_COWS, - &root->state)) - upper->cowonly = 1; - - /* - * if we know the block isn't shared - * we can void checking its backrefs. - */ - if (btrfs_block_can_be_shared(root, eb)) - upper->checked = 0; - else - upper->checked = 1; - - /* - * add the block to pending list if we - * need check its backrefs, we only do this once - * while walking up a tree as we will catch - * anything else later on. - */ - if (!upper->checked && need_check) { - need_check = false; - list_add_tail(&edge->list[UPPER], - &list); - } else { - if (upper->checked) - need_check = true; - INIT_LIST_HEAD(&edge->list[UPPER]); - } - } else { - upper = rb_entry(rb_node, struct backref_node, - rb_node); - ASSERT(upper->checked); - INIT_LIST_HEAD(&edge->list[UPPER]); - if (!upper->owner) - upper->owner = btrfs_header_owner(eb); - } - list_add_tail(&edge->list[LOWER], &lower->upper); - edge->node[LOWER] = lower; - edge->node[UPPER] = upper; - - if (rb_node) { - btrfs_put_root(root); - break; - } - lower = upper; - upper = NULL; - } - btrfs_release_path(path2); -next: - if (ptr < end) { - ptr += btrfs_extent_inline_ref_size(key.type); - if (ptr >= end) { - WARN_ON(ptr > end); - ptr = 0; - end = 0; - } - } - if (ptr >= end) - path1->slots[0]++; - } - btrfs_release_path(path1); - - cur->checked = 1; - WARN_ON(exist); - - /* the pending list isn't empty, take the first block to process */ - if (!list_empty(&list)) { - edge = list_entry(list.next, struct backref_edge, list[UPPER]); - list_del_init(&edge->list[UPPER]); - cur = edge->node[UPPER]; - goto again; - } - - /* - * everything goes well, connect backref nodes and insert backref nodes - * into the cache. - */ - ASSERT(node->checked); - cowonly = node->cowonly; - if (!cowonly) { - rb_node = tree_insert(&cache->rb_root, node->bytenr, - &node->rb_node); - if (rb_node) - backref_tree_panic(rb_node, -EEXIST, node->bytenr); - list_add_tail(&node->lower, &cache->leaves); - } - - list_for_each_entry(edge, &node->upper, list[LOWER]) - list_add_tail(&edge->list[UPPER], &list); - - while (!list_empty(&list)) { - edge = list_entry(list.next, struct backref_edge, list[UPPER]); - list_del_init(&edge->list[UPPER]); - upper = edge->node[UPPER]; - if (upper->detached) { - list_del(&edge->list[LOWER]); - lower = edge->node[LOWER]; - free_backref_edge(cache, edge); - if (list_empty(&lower->upper)) - list_add(&lower->list, &useless); - continue; - } - - if (!RB_EMPTY_NODE(&upper->rb_node)) { - if (upper->lowest) { - list_del_init(&upper->lower); - upper->lowest = 0; - } - - list_add_tail(&edge->list[UPPER], &upper->lower); - continue; - } - - if (!upper->checked) { - /* - * Still want to blow up for developers since this is a - * logic bug. - */ - ASSERT(0); - err = -EINVAL; - goto out; - } - if (cowonly != upper->cowonly) { - ASSERT(0); - err = -EINVAL; - goto out; - } - - if (!cowonly) { - rb_node = tree_insert(&cache->rb_root, upper->bytenr, - &upper->rb_node); - if (rb_node) - backref_tree_panic(rb_node, -EEXIST, - upper->bytenr); + edge = list_first_entry_or_null(&cache->pending_edge, + struct btrfs_backref_edge, list[UPPER]); + /* + * The pending list isn't empty, take the first block to + * process + */ + if (edge) { + list_del_init(&edge->list[UPPER]); + cur = edge->node[UPPER]; } + } while (edge); - list_add_tail(&edge->list[UPPER], &upper->lower); - - list_for_each_entry(edge, &upper->upper, list[LOWER]) - list_add_tail(&edge->list[UPPER], &list); + /* Finish the upper linkage of newly added edges/nodes */ + ret = btrfs_backref_finish_upper_links(cache, node); + if (ret < 0) { + err = ret; + goto out; } - /* - * process useless backref nodes. backref nodes for tree leaves - * are deleted from the cache. backref nodes for upper level - * tree blocks are left in the cache to avoid unnecessary backref - * lookup. - */ - while (!list_empty(&useless)) { - upper = list_entry(useless.next, struct backref_node, list); - list_del_init(&upper->list); - ASSERT(list_empty(&upper->upper)); - if (upper == node) - node = NULL; - if (upper->lowest) { - list_del_init(&upper->lower); - upper->lowest = 0; - } - while (!list_empty(&upper->lower)) { - edge = list_entry(upper->lower.next, - struct backref_edge, list[UPPER]); - list_del(&edge->list[UPPER]); - list_del(&edge->list[LOWER]); - lower = edge->node[LOWER]; - free_backref_edge(cache, edge); - if (list_empty(&lower->upper)) - list_add(&lower->list, &useless); - } - __mark_block_processed(rc, upper); - if (upper->level > 0) { - list_add(&upper->list, &cache->detached); - upper->detached = 1; - } else { - rb_erase(&upper->rb_node, &cache->rb_root); - free_backref_node(cache, upper); - } - } + if (handle_useless_nodes(rc, node)) + node = NULL; out: - btrfs_free_path(path1); - btrfs_free_path(path2); + btrfs_backref_iter_free(iter); + btrfs_free_path(path); if (err) { - while (!list_empty(&useless)) { - lower = list_entry(useless.next, - struct backref_node, list); - list_del_init(&lower->list); - } - while (!list_empty(&list)) { - edge = list_first_entry(&list, struct backref_edge, - list[UPPER]); - list_del(&edge->list[UPPER]); - list_del(&edge->list[LOWER]); - lower = edge->node[LOWER]; - upper = edge->node[UPPER]; - free_backref_edge(cache, edge); - - /* - * Lower is no longer linked to any upper backref nodes - * and isn't in the cache, we can free it ourselves. - */ - if (list_empty(&lower->upper) && - RB_EMPTY_NODE(&lower->rb_node)) - list_add(&lower->list, &useless); - - if (!RB_EMPTY_NODE(&upper->rb_node)) - continue; - - /* Add this guy's upper edges to the list to process */ - list_for_each_entry(edge, &upper->upper, list[LOWER]) - list_add_tail(&edge->list[UPPER], &list); - if (list_empty(&upper->upper)) - list_add(&upper->list, &useless); - } - - while (!list_empty(&useless)) { - lower = list_entry(useless.next, - struct backref_node, list); - list_del_init(&lower->list); - if (lower == node) - node = NULL; - free_backref_node(cache, lower); - } - - remove_backref_node(cache, node); + btrfs_backref_error_cleanup(cache, node); return ERR_PTR(err); } ASSERT(!node || !node->detached); + ASSERT(list_empty(&cache->useless_node) && + list_empty(&cache->pending_edge)); return node; } @@ -1244,19 +538,19 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, struct btrfs_root *dest) { struct btrfs_root *reloc_root = src->reloc_root; - struct backref_cache *cache = &rc->backref_cache; - struct backref_node *node = NULL; - struct backref_node *new_node; - struct backref_edge *edge; - struct backref_edge *new_edge; + struct btrfs_backref_cache *cache = &rc->backref_cache; + struct btrfs_backref_node *node = NULL; + struct btrfs_backref_node *new_node; + struct btrfs_backref_edge *edge; + struct btrfs_backref_edge *new_edge; struct rb_node *rb_node; if (cache->last_trans > 0) update_backref_cache(trans, cache); - rb_node = tree_search(&cache->rb_root, src->commit_root->start); + rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start); if (rb_node) { - node = rb_entry(rb_node, struct backref_node, rb_node); + node = rb_entry(rb_node, struct btrfs_backref_node, rb_node); if (node->detached) node = NULL; else @@ -1264,10 +558,10 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, } if (!node) { - rb_node = tree_search(&cache->rb_root, - reloc_root->commit_root->start); + rb_node = rb_simple_search(&cache->rb_root, + reloc_root->commit_root->start); if (rb_node) { - node = rb_entry(rb_node, struct backref_node, + node = rb_entry(rb_node, struct btrfs_backref_node, rb_node); BUG_ON(node->detached); } @@ -1276,12 +570,11 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, if (!node) return 0; - new_node = alloc_backref_node(cache); + new_node = btrfs_backref_alloc_node(cache, dest->node->start, + node->level); if (!new_node) return -ENOMEM; - new_node->bytenr = dest->node->start; - new_node->level = node->level; new_node->lowest = node->lowest; new_node->checked = 1; new_node->root = btrfs_grab_root(dest); @@ -1289,23 +582,21 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, if (!node->lowest) { list_for_each_entry(edge, &node->lower, list[UPPER]) { - new_edge = alloc_backref_edge(cache); + new_edge = btrfs_backref_alloc_edge(cache); if (!new_edge) goto fail; - new_edge->node[UPPER] = new_node; - new_edge->node[LOWER] = edge->node[LOWER]; - list_add_tail(&new_edge->list[UPPER], - &new_node->lower); + btrfs_backref_link_edge(new_edge, edge->node[LOWER], + new_node, LINK_UPPER); } } else { list_add_tail(&new_node->lower, &cache->leaves); } - rb_node = tree_insert(&cache->rb_root, new_node->bytenr, - &new_node->rb_node); + rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr, + &new_node->rb_node); if (rb_node) - backref_tree_panic(rb_node, -EEXIST, new_node->bytenr); + btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST); if (!new_node->lowest) { list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { @@ -1317,11 +608,11 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, fail: while (!list_empty(&new_node->lower)) { new_edge = list_entry(new_node->lower.next, - struct backref_edge, list[UPPER]); + struct btrfs_backref_edge, list[UPPER]); list_del(&new_edge->list[UPPER]); - free_backref_edge(cache, new_edge); + btrfs_backref_free_edge(cache, new_edge); } - free_backref_node(cache, new_node); + btrfs_backref_free_node(cache, new_node); return -ENOMEM; } @@ -1343,8 +634,8 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) node->data = root; spin_lock(&rc->reloc_root_tree.lock); - rb_node = tree_insert(&rc->reloc_root_tree.rb_root, - node->bytenr, &node->rb_node); + rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); if (rb_node) { btrfs_panic(fs_info, -EEXIST, @@ -1370,8 +661,8 @@ static void __del_reloc_root(struct btrfs_root *root) if (rc && root->node) { spin_lock(&rc->reloc_root_tree.lock); - rb_node = tree_search(&rc->reloc_root_tree.rb_root, - root->commit_root->start); + rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, + root->commit_root->start); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); @@ -1414,8 +705,8 @@ static int __update_reloc_root(struct btrfs_root *root) struct reloc_control *rc = fs_info->reloc_ctl; spin_lock(&rc->reloc_root_tree.lock); - rb_node = tree_search(&rc->reloc_root_tree.rb_root, - root->commit_root->start); + rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, + root->commit_root->start); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); @@ -1428,11 +719,11 @@ static int __update_reloc_root(struct btrfs_root *root) spin_lock(&rc->reloc_root_tree.lock); node->bytenr = root->node->start; - rb_node = tree_insert(&rc->reloc_root_tree.rb_root, - node->bytenr, &node->rb_node); + rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); if (rb_node) - backref_tree_panic(rb_node, -EEXIST, node->bytenr); + btrfs_backref_panic(fs_info, node->bytenr, -EEXIST); return 0; } @@ -1505,7 +796,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key); BUG_ON(IS_ERR(reloc_root)); - set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state); + set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state); reloc_root->last_trans = trans->transid; return reloc_root; } @@ -1679,14 +970,6 @@ again: return NULL; } -static int in_block_group(u64 bytenr, struct btrfs_block_group *block_group) -{ - if (bytenr >= block_group->start && - bytenr < block_group->start + block_group->length) - return 1; - return 0; -} - /* * get new location of data */ @@ -1784,7 +1067,8 @@ int replace_file_extents(struct btrfs_trans_handle *trans, num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); if (bytenr == 0) continue; - if (!in_block_group(bytenr, rc->block_group)) + if (!in_range(bytenr, rc->block_group->start, + rc->block_group->length)) continue; /* @@ -1940,7 +1224,7 @@ again: level = btrfs_header_level(parent); BUG_ON(level < lowest_level); - ret = btrfs_bin_search(parent, &key, level, &slot); + ret = btrfs_bin_search(parent, &key, &slot); if (ret < 0) break; if (ret && slot > 0) @@ -2560,7 +1844,8 @@ again: struct btrfs_root, root_list); list_del_init(&reloc_root->root_list); - root = read_fs_root(fs_info, reloc_root->root_key.offset); + root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, + false); BUG_ON(IS_ERR(root)); BUG_ON(root->reloc_root != reloc_root); @@ -2588,13 +1873,10 @@ again: static noinline_for_stack void free_reloc_roots(struct list_head *list) { - struct btrfs_root *reloc_root; + struct btrfs_root *reloc_root, *tmp; - while (!list_empty(list)) { - reloc_root = list_entry(list->next, struct btrfs_root, - root_list); + list_for_each_entry_safe(reloc_root, tmp, list, root_list) __del_reloc_root(reloc_root); - } } static noinline_for_stack @@ -2624,12 +1906,11 @@ again: reloc_root = list_entry(reloc_roots.next, struct btrfs_root, root_list); + root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, + false); if (btrfs_root_refs(&reloc_root->root_item) > 0) { - root = read_fs_root(fs_info, - reloc_root->root_key.offset); BUG_ON(IS_ERR(root)); BUG_ON(root->reloc_root != reloc_root); - ret = merge_reloc_root(rc, root); btrfs_put_root(root); if (ret) { @@ -2639,6 +1920,16 @@ again: goto out; } } else { + if (!IS_ERR(root)) { + if (root->reloc_root == reloc_root) { + root->reloc_root = NULL; + btrfs_put_root(reloc_root); + } + clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, + &root->state); + btrfs_put_root(root); + } + list_del_init(&reloc_root->root_list); /* Don't forget to queue this reloc root for cleanup */ list_add_tail(&reloc_root->reloc_dirty_list, @@ -2653,15 +1944,13 @@ again: out: if (ret) { btrfs_handle_fs_error(fs_info, ret, NULL); - if (!list_empty(&reloc_roots)) - free_reloc_roots(&reloc_roots); + free_reloc_roots(&reloc_roots); /* new reloc root may be added */ mutex_lock(&fs_info->reloc_mutex); list_splice_init(&rc->reloc_roots, &reloc_roots); mutex_unlock(&fs_info->reloc_mutex); - if (!list_empty(&reloc_roots)) - free_reloc_roots(&reloc_roots); + free_reloc_roots(&reloc_roots); } /* @@ -2702,7 +1991,7 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, if (reloc_root->last_trans == trans->transid) return 0; - root = read_fs_root(fs_info, reloc_root->root_key.offset); + root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); BUG_ON(IS_ERR(root)); BUG_ON(root->reloc_root != reloc_root); ret = btrfs_record_root_in_trans(trans, root); @@ -2714,10 +2003,10 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, static noinline_for_stack struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, struct reloc_control *rc, - struct backref_node *node, - struct backref_edge *edges[]) + struct btrfs_backref_node *node, + struct btrfs_backref_edge *edges[]) { - struct backref_node *next; + struct btrfs_backref_node *next; struct btrfs_root *root; int index = 0; @@ -2727,7 +2016,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, next = walk_up_backref(next, edges, &index); root = next->root; BUG_ON(!root); - BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state)); + BUG_ON(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)); if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { record_reloc_root_in_trans(trans, root); @@ -2746,7 +2035,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, ASSERT(next->root); list_add_tail(&next->list, &rc->backref_cache.changed); - __mark_block_processed(rc, next); + mark_block_processed(rc, next); break; } @@ -2771,18 +2060,21 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, } /* - * select a tree root for relocation. return NULL if the block - * is reference counted. we should use do_relocation() in this - * case. return a tree root pointer if the block isn't reference - * counted. return -ENOENT if the block is root of reloc tree. + * Select a tree root for relocation. + * + * Return NULL if the block is not shareable. We should use do_relocation() in + * this case. + * + * Return a tree root pointer if the block is shareable. + * Return -ENOENT if the block is root of reloc tree. */ static noinline_for_stack -struct btrfs_root *select_one_root(struct backref_node *node) +struct btrfs_root *select_one_root(struct btrfs_backref_node *node) { - struct backref_node *next; + struct btrfs_backref_node *next; struct btrfs_root *root; struct btrfs_root *fs_root = NULL; - struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; int index = 0; next = node; @@ -2792,8 +2084,8 @@ struct btrfs_root *select_one_root(struct backref_node *node) root = next->root; BUG_ON(!root); - /* no other choice for non-references counted tree */ - if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + /* No other choice for non-shareable tree */ + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return root; if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) @@ -2814,12 +2106,12 @@ struct btrfs_root *select_one_root(struct backref_node *node) static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc, - struct backref_node *node, int reserve) + struct btrfs_backref_node *node, int reserve) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - struct backref_node *next = node; - struct backref_edge *edge; - struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + struct btrfs_backref_node *next = node; + struct btrfs_backref_edge *edge; + struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; u64 num_bytes = 0; int index = 0; @@ -2837,7 +2129,7 @@ u64 calcu_metadata_size(struct reloc_control *rc, break; edge = list_entry(next->upper.next, - struct backref_edge, list[LOWER]); + struct btrfs_backref_edge, list[LOWER]); edges[index++] = edge; next = edge->node[UPPER]; } @@ -2848,7 +2140,7 @@ u64 calcu_metadata_size(struct reloc_control *rc, static int reserve_metadata_space(struct btrfs_trans_handle *trans, struct reloc_control *rc, - struct backref_node *node) + struct btrfs_backref_node *node) { struct btrfs_root *root = rc->extent_root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -2896,14 +2188,14 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, */ static int do_relocation(struct btrfs_trans_handle *trans, struct reloc_control *rc, - struct backref_node *node, + struct btrfs_backref_node *node, struct btrfs_key *key, struct btrfs_path *path, int lowest) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - struct backref_node *upper; - struct backref_edge *edge; - struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + struct btrfs_backref_node *upper; + struct btrfs_backref_edge *edge; + struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; struct btrfs_root *root; struct extent_buffer *eb; u32 blocksize; @@ -2929,8 +2221,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (upper->eb && !upper->locked) { if (!lowest) { - ret = btrfs_bin_search(upper->eb, key, - upper->level, &slot); + ret = btrfs_bin_search(upper->eb, key, &slot); if (ret < 0) { err = ret; goto next; @@ -2940,7 +2231,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (node->eb->start == bytenr) goto next; } - drop_node_buffer(upper); + btrfs_backref_drop_node_buffer(upper); } if (!upper->eb) { @@ -2968,8 +2259,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, slot = path->slots[upper->level]; btrfs_release_path(path); } else { - ret = btrfs_bin_search(upper->eb, key, upper->level, - &slot); + ret = btrfs_bin_search(upper->eb, key, &slot); if (ret < 0) { err = ret; goto next; @@ -3039,15 +2329,15 @@ static int do_relocation(struct btrfs_trans_handle *trans, } next: if (!upper->pending) - drop_node_buffer(upper); + btrfs_backref_drop_node_buffer(upper); else - unlock_node_buffer(upper); + btrfs_backref_unlock_node_buffer(upper); if (err) break; } if (!err && node->pending) { - drop_node_buffer(node); + btrfs_backref_drop_node_buffer(node); list_move_tail(&node->list, &rc->backref_cache.changed); node->pending = 0; } @@ -3059,7 +2349,7 @@ next: static int link_to_upper(struct btrfs_trans_handle *trans, struct reloc_control *rc, - struct backref_node *node, + struct btrfs_backref_node *node, struct btrfs_path *path) { struct btrfs_key key; @@ -3073,15 +2363,15 @@ static int finish_pending_nodes(struct btrfs_trans_handle *trans, struct btrfs_path *path, int err) { LIST_HEAD(list); - struct backref_cache *cache = &rc->backref_cache; - struct backref_node *node; + struct btrfs_backref_cache *cache = &rc->backref_cache; + struct btrfs_backref_node *node; int level; int ret; for (level = 0; level < BTRFS_MAX_LEVEL; level++) { while (!list_empty(&cache->pending[level])) { node = list_entry(cache->pending[level].next, - struct backref_node, list); + struct btrfs_backref_node, list); list_move_tail(&node->list, &list); BUG_ON(!node->pending); @@ -3096,35 +2386,16 @@ static int finish_pending_nodes(struct btrfs_trans_handle *trans, return err; } -static void mark_block_processed(struct reloc_control *rc, - u64 bytenr, u32 blocksize) -{ - set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, - EXTENT_DIRTY); -} - -static void __mark_block_processed(struct reloc_control *rc, - struct backref_node *node) -{ - u32 blocksize; - if (node->level == 0 || - in_block_group(node->bytenr, rc->block_group)) { - blocksize = rc->extent_root->fs_info->nodesize; - mark_block_processed(rc, node->bytenr, blocksize); - } - node->processed = 1; -} - /* * mark a block and all blocks directly/indirectly reference the block * as processed. */ static void update_processed_blocks(struct reloc_control *rc, - struct backref_node *node) + struct btrfs_backref_node *node) { - struct backref_node *next = node; - struct backref_edge *edge; - struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + struct btrfs_backref_node *next = node; + struct btrfs_backref_edge *edge; + struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1]; int index = 0; while (next) { @@ -3133,13 +2404,13 @@ static void update_processed_blocks(struct reloc_control *rc, if (next->processed) break; - __mark_block_processed(rc, next); + mark_block_processed(rc, next); if (list_empty(&next->upper)) break; edge = list_entry(next->upper.next, - struct backref_edge, list[LOWER]); + struct btrfs_backref_edge, list[LOWER]); edges[index++] = edge; next = edge->node[UPPER]; } @@ -3184,7 +2455,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, */ static int relocate_tree_block(struct btrfs_trans_handle *trans, struct reloc_control *rc, - struct backref_node *node, + struct btrfs_backref_node *node, struct btrfs_key *key, struct btrfs_path *path) { @@ -3210,7 +2481,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, } if (root) { - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { + if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { BUG_ON(node->new_bytenr); BUG_ON(!list_empty(&node->list)); btrfs_record_root_in_trans(trans, root); @@ -3234,7 +2505,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, } out: if (ret || node->level == 0 || node->cowonly) - remove_backref_node(&rc->backref_cache, node); + btrfs_backref_cleanup_node(&rc->backref_cache, node); return ret; } @@ -3246,7 +2517,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct rb_root *blocks) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - struct backref_node *node; + struct btrfs_backref_node *node; struct btrfs_path *path; struct tree_block *block; struct tree_block *next; @@ -3613,9 +2884,10 @@ static int add_tree_block(struct reloc_control *rc, block->level = level; block->key_ready = 0; - rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); + rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); if (rb_node) - backref_tree_panic(rb_node, -EEXIST, block->bytenr); + btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr, + -EEXIST); return 0; } @@ -3636,7 +2908,7 @@ static int __add_tree_block(struct reloc_control *rc, if (tree_block_processed(bytenr, rc)) return 0; - if (tree_search(blocks, bytenr)) + if (rb_simple_search(blocks, bytenr)) return 0; path = btrfs_alloc_path(); @@ -3698,7 +2970,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info, struct inode *inode, u64 ino) { - struct btrfs_key key; struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; int ret = 0; @@ -3706,11 +2977,7 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info, if (inode) goto truncate; - key.objectid = ino; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - - inode = btrfs_iget(fs_info->sb, &key, root); + inode = btrfs_iget(fs_info->sb, ino, root); if (IS_ERR(inode)) return -ENOENT; @@ -4122,7 +3389,7 @@ restart: rc->create_reloc_tree = 0; set_reloc_control(rc); - backref_cache_cleanup(&rc->backref_cache); + btrfs_backref_release_cache(&rc->backref_cache); btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); /* @@ -4198,14 +3465,10 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, struct inode *inode = NULL; struct btrfs_trans_handle *trans; struct btrfs_root *root; - struct btrfs_key key; u64 objectid; int err = 0; - root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); - if (IS_ERR(root)) - return ERR_CAST(root); - + root = btrfs_grab_root(fs_info->data_reloc_root); trans = btrfs_start_transaction(root, 6); if (IS_ERR(trans)) { btrfs_put_root(root); @@ -4219,10 +3482,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, err = __insert_orphan_inode(trans, root, objectid); BUG_ON(err); - key.objectid = objectid; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - inode = btrfs_iget(fs_info->sb, &key, root); + inode = btrfs_iget(fs_info->sb, objectid, root); BUG_ON(IS_ERR(inode)); BTRFS_I(inode)->index_cnt = group->start; @@ -4249,7 +3509,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&rc->reloc_roots); INIT_LIST_HEAD(&rc->dirty_subvol_roots); - backref_cache_init(&rc->backref_cache); + btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1); mapping_tree_init(&rc->reloc_root_tree); extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS, NULL); @@ -4494,12 +3754,12 @@ int btrfs_recover_relocation(struct btrfs_root *root) goto out; } - set_bit(BTRFS_ROOT_REF_COWS, &reloc_root->state); + set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state); list_add(&reloc_root->root_list, &reloc_roots); if (btrfs_root_refs(&reloc_root->root_item) > 0) { - fs_root = read_fs_root(fs_info, - reloc_root->root_key.offset); + fs_root = btrfs_get_fs_root(fs_info, + reloc_root->root_key.offset, false); if (IS_ERR(fs_root)) { ret = PTR_ERR(fs_root); if (ret != -ENOENT) { @@ -4555,7 +3815,8 @@ int btrfs_recover_relocation(struct btrfs_root *root) continue; } - fs_root = read_fs_root(fs_info, reloc_root->root_key.offset); + fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, + false); if (IS_ERR(fs_root)) { err = PTR_ERR(fs_root); list_add_tail(&reloc_root->root_list, &reloc_roots); @@ -4591,20 +3852,16 @@ out_unset: unset_reloc_control(rc); free_reloc_control(rc); out: - if (!list_empty(&reloc_roots)) - free_reloc_roots(&reloc_roots); + free_reloc_roots(&reloc_roots); btrfs_free_path(path); if (err == 0) { /* cleanup orphan inode in data relocation tree */ - fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); - if (IS_ERR(fs_root)) { - err = PTR_ERR(fs_root); - } else { - err = btrfs_orphan_cleanup(fs_root); - btrfs_put_root(fs_root); - } + fs_root = btrfs_grab_root(fs_info->data_reloc_root); + ASSERT(fs_root); + err = btrfs_orphan_cleanup(fs_root); + btrfs_put_root(fs_root); } return err; } @@ -4666,7 +3923,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct reloc_control *rc; - struct backref_node *node; + struct btrfs_backref_node *node; int first_cow = 0; int level; int ret = 0; @@ -4691,7 +3948,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, BUG_ON(node->bytenr != buf->start && node->new_bytenr != buf->start); - drop_node_buffer(node); + btrfs_backref_drop_node_buffer(node); atomic_inc(&cow->refs); node->eb = cow; node->new_bytenr = cow->start; @@ -4703,7 +3960,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, } if (first_cow) - __mark_block_processed(rc, node); + mark_block_processed(rc, node); if (first_cow && level > 0) rc->nodes_relocated += buf->len; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 668f22844017..c89697486366 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -210,7 +210,6 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) struct extent_buffer *leaf; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_key root_key; struct btrfs_root *root; int err = 0; int ret; @@ -223,10 +222,9 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = 0; - root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = (u64)-1; - while (1) { + u64 root_objectid; + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); if (ret < 0) { err = ret; @@ -250,10 +248,10 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) key.type != BTRFS_ORPHAN_ITEM_KEY) break; - root_key.objectid = key.offset; + root_objectid = key.offset; key.offset++; - root = btrfs_get_fs_root(fs_info, &root_key, false); + root = btrfs_get_fs_root(fs_info, root_objectid, false); err = PTR_ERR_OR_ZERO(root); if (err && err != -ENOENT) { break; @@ -270,7 +268,7 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) break; } err = btrfs_del_orphan_item(trans, tree_root, - root_key.objectid); + root_objectid); btrfs_end_transaction(trans); if (err) { btrfs_handle_fs_error(fs_info, err, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index adaf8ab694d5..016a025e36c7 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -647,13 +647,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, struct btrfs_fs_info *fs_info = swarn->dev->fs_info; struct inode_fs_paths *ipath = NULL; struct btrfs_root *local_root; - struct btrfs_key root_key; struct btrfs_key key; - root_key.objectid = root; - root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = (u64)-1; - local_root = btrfs_get_fs_root(fs_info, &root_key, true); + local_root = btrfs_get_fs_root(fs_info, root, true); if (IS_ERR(local_root)) { ret = PTR_ERR(local_root); goto err; @@ -3046,7 +3042,8 @@ out: static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *scrub_dev, - int num, u64 base, u64 length) + int num, u64 base, u64 length, + struct btrfs_block_group *cache) { struct btrfs_path *path, *ppath; struct btrfs_fs_info *fs_info = sctx->fs_info; @@ -3284,6 +3281,20 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, break; } + /* + * If our block group was removed in the meanwhile, just + * stop scrubbing since there is no point in continuing. + * Continuing would prevent reusing its device extents + * for new block groups for a long time. + */ + spin_lock(&cache->lock); + if (cache->removed) { + spin_unlock(&cache->lock); + ret = 0; + goto out; + } + spin_unlock(&cache->lock); + extent = btrfs_item_ptr(l, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(l, extent); @@ -3328,13 +3339,14 @@ again: &extent_dev, &extent_mirror_num); - ret = btrfs_lookup_csums_range(csum_root, - extent_logical, - extent_logical + - extent_len - 1, - &sctx->csum_list, 1); - if (ret) - goto out; + if (flags & BTRFS_EXTENT_FLAG_DATA) { + ret = btrfs_lookup_csums_range(csum_root, + extent_logical, + extent_logical + extent_len - 1, + &sctx->csum_list, 1); + if (ret) + goto out; + } ret = scrub_extent(sctx, map, extent_logical, extent_len, extent_physical, extent_dev, flags, @@ -3457,7 +3469,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { ret = scrub_stripe(sctx, map, scrub_dev, i, - chunk_offset, length); + chunk_offset, length, cache); if (ret) goto out; } @@ -3555,6 +3567,23 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, goto skip; /* + * Make sure that while we are scrubbing the corresponding block + * group doesn't get its logical address and its device extents + * reused for another block group, which can possibly be of a + * different type and different profile. We do this to prevent + * false error detections and crashes due to bogus attempts to + * repair extents. + */ + spin_lock(&cache->lock); + if (cache->removed) { + spin_unlock(&cache->lock); + btrfs_put_block_group(cache); + goto skip; + } + btrfs_freeze_block_group(cache); + spin_unlock(&cache->lock); + + /* * we need call btrfs_inc_block_group_ro() with scrubs_paused, * to avoid deadlock caused by: * btrfs_inc_block_group_ro() @@ -3609,6 +3638,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, } else { btrfs_warn(fs_info, "failed setting block group ro: %d", ret); + btrfs_unfreeze_block_group(cache); btrfs_put_block_group(cache); scrub_pause_off(fs_info); break; @@ -3695,6 +3725,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, spin_unlock(&cache->lock); } + btrfs_unfreeze_block_group(cache); btrfs_put_block_group(cache); if (ret) break; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 6a92ecf9eaa2..d9813a5b075a 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -23,6 +23,7 @@ #include "btrfs_inode.h" #include "transaction.h" #include "compression.h" +#include "xattr.h" /* * Maximum number of references an extent can have in order for us to attempt to @@ -4545,6 +4546,10 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, struct fs_path *p; struct posix_acl_xattr_header dummy_acl; + /* Capabilities are emitted by finish_inode_if_needed */ + if (!strncmp(name, XATTR_NAME_CAPS, name_len)) + return 0; + p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -4801,17 +4806,12 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) struct inode *inode; struct page *page; char *addr; - struct btrfs_key key; pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; unsigned pg_offset = offset_in_page(offset); ssize_t ret = 0; - key.objectid = sctx->cur_ino; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - - inode = btrfs_iget(fs_info->sb, &key, root); + inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -5107,6 +5107,64 @@ static int send_extent_data(struct send_ctx *sctx, return 0; } +/* + * Search for a capability xattr related to sctx->cur_ino. If the capability is + * found, call send_set_xattr function to emit it. + * + * Return 0 if there isn't a capability, or when the capability was emitted + * successfully, or < 0 if an error occurred. + */ +static int send_capabilities(struct send_ctx *sctx) +{ + struct fs_path *fspath = NULL; + struct btrfs_path *path; + struct btrfs_dir_item *di; + struct extent_buffer *leaf; + unsigned long data_ptr; + char *buf = NULL; + int buf_len; + int ret = 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino, + XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0); + if (!di) { + /* There is no xattr for this inode */ + goto out; + } else if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + leaf = path->nodes[0]; + buf_len = btrfs_dir_data_len(leaf, di); + + fspath = fs_path_alloc(); + buf = kmalloc(buf_len, GFP_KERNEL); + if (!fspath || !buf) { + ret = -ENOMEM; + goto out; + } + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); + if (ret < 0) + goto out; + + data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di); + read_extent_buffer(leaf, buf, data_ptr, buf_len); + + ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS, + strlen(XATTR_NAME_CAPS), buf, buf_len); +out: + kfree(buf); + fs_path_free(fspath); + btrfs_free_path(path); + return ret; +} + static int clone_range(struct send_ctx *sctx, struct clone_root *clone_root, const u64 disk_byte, @@ -5972,6 +6030,10 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) goto out; } + ret = send_capabilities(sctx); + if (ret < 0) + goto out; + /* * If other directory inodes depended on our current directory * inode's move/rename, now do their move/rename operations. @@ -7021,7 +7083,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root; struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_root *clone_root; - struct btrfs_key key; struct send_ctx *sctx = NULL; u32 i; u64 *clone_sources_tmp = NULL; @@ -7143,11 +7204,8 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) } for (i = 0; i < arg->clone_sources_count; i++) { - key.objectid = clone_sources_tmp[i]; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - clone_root = btrfs_get_fs_root(fs_info, &key, true); + clone_root = btrfs_get_fs_root(fs_info, + clone_sources_tmp[i], true); if (IS_ERR(clone_root)) { ret = PTR_ERR(clone_root); goto out; @@ -7178,11 +7236,8 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) } if (arg->parent_root) { - key.objectid = arg->parent_root; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - sctx->parent_root = btrfs_get_fs_root(fs_info, &key, true); + sctx->parent_root = btrfs_get_fs_root(fs_info, arg->parent_root, + true); if (IS_ERR(sctx->parent_root)) { ret = PTR_ERR(sctx->parent_root); goto out; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index ff17a4420358..41ee88633769 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -626,6 +626,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, struct reserve_ticket *ticket = NULL; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv; struct btrfs_trans_handle *trans; u64 bytes_needed; u64 reclaim_bytes = 0; @@ -688,6 +689,11 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, spin_lock(&delayed_refs_rsv->lock); reclaim_bytes += delayed_refs_rsv->reserved; spin_unlock(&delayed_refs_rsv->lock); + + spin_lock(&trans_rsv->lock); + reclaim_bytes += trans_rsv->reserved; + spin_unlock(&trans_rsv->lock); + if (reclaim_bytes >= bytes_needed) goto commit; bytes_needed -= reclaim_bytes; @@ -856,6 +862,34 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); } +static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 min_bytes; + + if (global_rsv->space_info != space_info) + return false; + + spin_lock(&global_rsv->lock); + min_bytes = div_factor(global_rsv->size, 1); + if (global_rsv->reserved < min_bytes + ticket->bytes) { + spin_unlock(&global_rsv->lock); + return false; + } + global_rsv->reserved -= ticket->bytes; + ticket->bytes = 0; + list_del_init(&ticket->list); + wake_up(&ticket->wait); + space_info->tickets_id++; + if (global_rsv->reserved < global_rsv->size) + global_rsv->full = 0; + spin_unlock(&global_rsv->lock); + + return true; +} + /* * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets * @fs_info - fs_info for this fs @@ -888,6 +922,10 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); + if (ticket->steal && + steal_from_global_rsv(fs_info, space_info, ticket)) + return true; + /* * may_commit_transaction will avoid committing the transaction * if it doesn't feel like the space reclaimed by the commit @@ -1104,6 +1142,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, switch (flush) { case BTRFS_RESERVE_FLUSH_ALL: + case BTRFS_RESERVE_FLUSH_ALL_STEAL: wait_reserve_ticket(fs_info, space_info, ticket); break; case BTRFS_RESERVE_FLUSH_LIMIT: @@ -1125,11 +1164,17 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, ret = ticket->error; if (ticket->bytes || ticket->error) { /* - * Need to delete here for priority tickets. For regular tickets - * either the async reclaim job deletes the ticket from the list - * or we delete it ourselves at wait_reserve_ticket(). + * We were a priority ticket, so we need to delete ourselves + * from the list. Because we could have other priority tickets + * behind us that require less space, run + * btrfs_try_granting_tickets() to see if their reservations can + * now be made. */ - remove_ticket(space_info, ticket); + if (!list_empty(&ticket->list)) { + remove_ticket(space_info, ticket); + btrfs_try_granting_tickets(fs_info, space_info); + } + if (!ret) ret = -ENOSPC; } @@ -1145,6 +1190,16 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, return ret; } +/* + * This returns true if this flush state will go through the ordinary flushing + * code. + */ +static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush) +{ + return (flush == BTRFS_RESERVE_FLUSH_ALL) || + (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); +} + /** * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space * @root - the root we're allocating for @@ -1175,8 +1230,17 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); ret = -ENOSPC; used = btrfs_space_info_used(space_info, true); - pending_tickets = !list_empty(&space_info->tickets) || - !list_empty(&space_info->priority_tickets); + + /* + * We don't want NO_FLUSH allocations to jump everybody, they can + * generally handle ENOSPC in a different way, so treat them the same as + * normal flushers when it comes to skipping pending tickets. + */ + if (is_normal_flushing(flush) || (flush == BTRFS_RESERVE_NO_FLUSH)) + pending_tickets = !list_empty(&space_info->tickets) || + !list_empty(&space_info->priority_tickets); + else + pending_tickets = !list_empty(&space_info->priority_tickets); /* * Carry on if we have enough space (short-circuit) OR call @@ -1198,12 +1262,13 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * the list and we will do our own flushing further down. */ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { - ASSERT(space_info->reclaim_size >= 0); ticket.bytes = orig_bytes; ticket.error = 0; space_info->reclaim_size += ticket.bytes; init_waitqueue_head(&ticket.wait); - if (flush == BTRFS_RESERVE_FLUSH_ALL) { + ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); + if (flush == BTRFS_RESERVE_FLUSH_ALL || + flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { list_add_tail(&ticket.list, &space_info->tickets); if (!space_info->flush) { space_info->flush = 1; diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 0a5001ef1481..c3c64019950a 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -78,6 +78,7 @@ struct btrfs_space_info { struct reserve_ticket { u64 bytes; int error; + bool steal; struct list_head list; wait_queue_head_t wait; }; diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index 73f7987143df..079b059818e9 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -17,151 +17,152 @@ static inline void put_unaligned_le8(u8 val, void *p) *(u8 *)p = val; } +static bool check_setget_bounds(const struct extent_buffer *eb, + const void *ptr, unsigned off, int size) +{ + const unsigned long member_offset = (unsigned long)ptr + off; + + if (member_offset > eb->len) { + btrfs_warn(eb->fs_info, + "bad eb member start: ptr 0x%lx start %llu member offset %lu size %d", + (unsigned long)ptr, eb->start, member_offset, size); + return false; + } + if (member_offset + size > eb->len) { + btrfs_warn(eb->fs_info, + "bad eb member end: ptr 0x%lx start %llu member offset %lu size %d", + (unsigned long)ptr, eb->start, member_offset, size); + return false; + } + + return true; +} + /* - * this is some deeply nasty code. + * Macro templates that define helpers to read/write extent buffer data of a + * given size, that are also used via ctree.h for access to item members by + * specialized helpers. * - * The end result is that anyone who #includes ctree.h gets a - * declaration for the btrfs_set_foo functions and btrfs_foo functions, - * which are wrappers of btrfs_set_token_#bits functions and - * btrfs_get_token_#bits functions, which are defined in this file. + * Generic helpers: + * - btrfs_set_8 (for 8/16/32/64) + * - btrfs_get_8 (for 8/16/32/64) * - * These setget functions do all the extent_buffer related mapping - * required to efficiently read and write specific fields in the extent - * buffers. Every pointer to metadata items in btrfs is really just - * an unsigned long offset into the extent buffer which has been - * cast to a specific type. This gives us all the gcc type checking. + * Generic helpers with a token (cached address of the most recently accessed + * page): + * - btrfs_set_token_8 (for 8/16/32/64) + * - btrfs_get_token_8 (for 8/16/32/64) * - * The extent buffer api is used to do the page spanning work required to - * have a metadata blocksize different from the page size. + * The set/get functions handle data spanning two pages transparently, in case + * metadata block size is larger than page. Every pointer to metadata items is + * an offset into the extent buffer page array, cast to a specific type. This + * gives us all the type checking. * - * There are 2 variants defined, one with a token pointer and one without. + * The extent buffer pages stored in the array pages do not form a contiguous + * phyusical range, but the API functions assume the linear offset to the range + * from 0 to metadata node size. */ #define DEFINE_BTRFS_SETGET_BITS(bits) \ -u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \ - const void *ptr, unsigned long off, \ - struct btrfs_map_token *token) \ +u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ + const void *ptr, unsigned long off) \ { \ - unsigned long part_offset = (unsigned long)ptr; \ - unsigned long offset = part_offset + off; \ - void *p; \ - int err; \ - char *kaddr; \ - unsigned long map_start; \ - unsigned long map_len; \ - int size = sizeof(u##bits); \ - u##bits res; \ + const unsigned long member_offset = (unsigned long)ptr + off; \ + const unsigned long idx = member_offset >> PAGE_SHIFT; \ + const unsigned long oip = offset_in_page(member_offset); \ + const int size = sizeof(u##bits); \ + u8 lebytes[sizeof(u##bits)]; \ + const int part = PAGE_SIZE - oip; \ \ ASSERT(token); \ - ASSERT(token->eb == eb); \ - \ - if (token->kaddr && token->offset <= offset && \ - (token->offset + PAGE_SIZE >= offset + size)) { \ - kaddr = token->kaddr; \ - p = kaddr + part_offset - token->offset; \ - res = get_unaligned_le##bits(p + off); \ - return res; \ + ASSERT(token->kaddr); \ + ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ + if (token->offset <= member_offset && \ + member_offset + size <= token->offset + PAGE_SIZE) { \ + return get_unaligned_le##bits(token->kaddr + oip); \ } \ - err = map_private_extent_buffer(eb, offset, size, \ - &kaddr, &map_start, &map_len); \ - if (err) { \ - __le##bits leres; \ + token->kaddr = page_address(token->eb->pages[idx]); \ + token->offset = idx << PAGE_SHIFT; \ + if (oip + size <= PAGE_SIZE) \ + return get_unaligned_le##bits(token->kaddr + oip); \ \ - read_extent_buffer(eb, &leres, offset, size); \ - return le##bits##_to_cpu(leres); \ - } \ - p = kaddr + part_offset - map_start; \ - res = get_unaligned_le##bits(p + off); \ - token->kaddr = kaddr; \ - token->offset = map_start; \ - return res; \ + memcpy(lebytes, token->kaddr + oip, part); \ + token->kaddr = page_address(token->eb->pages[idx + 1]); \ + token->offset = (idx + 1) << PAGE_SHIFT; \ + memcpy(lebytes + part, token->kaddr, size - part); \ + return get_unaligned_le##bits(lebytes); \ } \ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ const void *ptr, unsigned long off) \ { \ - unsigned long part_offset = (unsigned long)ptr; \ - unsigned long offset = part_offset + off; \ - void *p; \ - int err; \ - char *kaddr; \ - unsigned long map_start; \ - unsigned long map_len; \ - int size = sizeof(u##bits); \ - u##bits res; \ + const unsigned long member_offset = (unsigned long)ptr + off; \ + const unsigned long oip = offset_in_page(member_offset); \ + const unsigned long idx = member_offset >> PAGE_SHIFT; \ + char *kaddr = page_address(eb->pages[idx]); \ + const int size = sizeof(u##bits); \ + const int part = PAGE_SIZE - oip; \ + u8 lebytes[sizeof(u##bits)]; \ \ - err = map_private_extent_buffer(eb, offset, size, \ - &kaddr, &map_start, &map_len); \ - if (err) { \ - __le##bits leres; \ + ASSERT(check_setget_bounds(eb, ptr, off, size)); \ + if (oip + size <= PAGE_SIZE) \ + return get_unaligned_le##bits(kaddr + oip); \ \ - read_extent_buffer(eb, &leres, offset, size); \ - return le##bits##_to_cpu(leres); \ - } \ - p = kaddr + part_offset - map_start; \ - res = get_unaligned_le##bits(p + off); \ - return res; \ + memcpy(lebytes, kaddr + oip, part); \ + kaddr = page_address(eb->pages[idx + 1]); \ + memcpy(lebytes + part, kaddr, size - part); \ + return get_unaligned_le##bits(lebytes); \ } \ -void btrfs_set_token_##bits(struct extent_buffer *eb, \ +void btrfs_set_token_##bits(struct btrfs_map_token *token, \ const void *ptr, unsigned long off, \ - u##bits val, \ - struct btrfs_map_token *token) \ + u##bits val) \ { \ - unsigned long part_offset = (unsigned long)ptr; \ - unsigned long offset = part_offset + off; \ - void *p; \ - int err; \ - char *kaddr; \ - unsigned long map_start; \ - unsigned long map_len; \ - int size = sizeof(u##bits); \ + const unsigned long member_offset = (unsigned long)ptr + off; \ + const unsigned long idx = member_offset >> PAGE_SHIFT; \ + const unsigned long oip = offset_in_page(member_offset); \ + const int size = sizeof(u##bits); \ + u8 lebytes[sizeof(u##bits)]; \ + const int part = PAGE_SIZE - oip; \ \ ASSERT(token); \ - ASSERT(token->eb == eb); \ - \ - if (token->kaddr && token->offset <= offset && \ - (token->offset + PAGE_SIZE >= offset + size)) { \ - kaddr = token->kaddr; \ - p = kaddr + part_offset - token->offset; \ - put_unaligned_le##bits(val, p + off); \ + ASSERT(token->kaddr); \ + ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \ + if (token->offset <= member_offset && \ + member_offset + size <= token->offset + PAGE_SIZE) { \ + put_unaligned_le##bits(val, token->kaddr + oip); \ return; \ } \ - err = map_private_extent_buffer(eb, offset, size, \ - &kaddr, &map_start, &map_len); \ - if (err) { \ - __le##bits val2; \ - \ - val2 = cpu_to_le##bits(val); \ - write_extent_buffer(eb, &val2, offset, size); \ + token->kaddr = page_address(token->eb->pages[idx]); \ + token->offset = idx << PAGE_SHIFT; \ + if (oip + size <= PAGE_SIZE) { \ + put_unaligned_le##bits(val, token->kaddr + oip); \ return; \ } \ - p = kaddr + part_offset - map_start; \ - put_unaligned_le##bits(val, p + off); \ - token->kaddr = kaddr; \ - token->offset = map_start; \ + put_unaligned_le##bits(val, lebytes); \ + memcpy(token->kaddr + oip, lebytes, part); \ + token->kaddr = page_address(token->eb->pages[idx + 1]); \ + token->offset = (idx + 1) << PAGE_SHIFT; \ + memcpy(token->kaddr, lebytes + part, size - part); \ } \ -void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \ +void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ unsigned long off, u##bits val) \ { \ - unsigned long part_offset = (unsigned long)ptr; \ - unsigned long offset = part_offset + off; \ - void *p; \ - int err; \ - char *kaddr; \ - unsigned long map_start; \ - unsigned long map_len; \ - int size = sizeof(u##bits); \ - \ - err = map_private_extent_buffer(eb, offset, size, \ - &kaddr, &map_start, &map_len); \ - if (err) { \ - __le##bits val2; \ + const unsigned long member_offset = (unsigned long)ptr + off; \ + const unsigned long oip = offset_in_page(member_offset); \ + const unsigned long idx = member_offset >> PAGE_SHIFT; \ + char *kaddr = page_address(eb->pages[idx]); \ + const int size = sizeof(u##bits); \ + const int part = PAGE_SIZE - oip; \ + u8 lebytes[sizeof(u##bits)]; \ \ - val2 = cpu_to_le##bits(val); \ - write_extent_buffer(eb, &val2, offset, size); \ + ASSERT(check_setget_bounds(eb, ptr, off, size)); \ + if (oip + size <= PAGE_SIZE) { \ + put_unaligned_le##bits(val, kaddr + oip); \ return; \ } \ - p = kaddr + part_offset - map_start; \ - put_unaligned_le##bits(val, p + off); \ + \ + put_unaligned_le##bits(val, lebytes); \ + memcpy(kaddr + oip, lebytes, part); \ + kaddr = page_address(eb->pages[idx + 1]); \ + memcpy(kaddr, lebytes + part, size - part); \ } DEFINE_BTRFS_SETGET_BITS(8) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7932d8d07cff..bc73fd670702 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -72,23 +72,32 @@ const char * __attribute_const__ btrfs_decode_error(int errno) char *errstr = "unknown"; switch (errno) { - case -EIO: + case -ENOENT: /* -2 */ + errstr = "No such entry"; + break; + case -EIO: /* -5 */ errstr = "IO failure"; break; - case -ENOMEM: + case -ENOMEM: /* -12*/ errstr = "Out of memory"; break; - case -EROFS: - errstr = "Readonly filesystem"; - break; - case -EEXIST: + case -EEXIST: /* -17 */ errstr = "Object already exists"; break; - case -ENOSPC: + case -ENOSPC: /* -28 */ errstr = "No space left"; break; - case -ENOENT: - errstr = "No such entry"; + case -EROFS: /* -30 */ + errstr = "Readonly filesystem"; + break; + case -EOPNOTSUPP: /* -95 */ + errstr = "Operation not supported"; + break; + case -EUCLEAN: /* -117 */ + errstr = "Filesystem corrupted"; + break; + case -EDQUOT: /* -122 */ + errstr = "Quota exceeded"; break; } @@ -1093,10 +1102,7 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref); btrfs_release_path(path); - key.objectid = subvol_objectid; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - fs_root = btrfs_get_fs_root(fs_info, &key, true); + fs_root = btrfs_get_fs_root(fs_info, subvol_objectid, true); if (IS_ERR(fs_root)) { ret = PTR_ERR(fs_root); fs_root = NULL; @@ -1211,7 +1217,6 @@ static int btrfs_fill_super(struct super_block *sb, { struct inode *inode; struct btrfs_fs_info *fs_info = btrfs_sb(sb); - struct btrfs_key key; int err; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -1239,10 +1244,7 @@ static int btrfs_fill_super(struct super_block *sb, return err; } - key.objectid = BTRFS_FIRST_FREE_OBJECTID; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - inode = btrfs_iget(sb, &key, fs_info->fs_root); + inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto fail_close; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2d5498136e5e..b359d4b17658 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -21,6 +21,7 @@ #include "dev-replace.h" #include "qgroup.h" #include "block-group.h" +#include "space-info.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -141,7 +142,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) struct btrfs_block_group, bg_list); list_del_init(&cache->bg_list); - btrfs_put_block_group_trimming(cache); + btrfs_unfreeze_block_group(cache); btrfs_put_block_group(cache); } WARN_ON(!list_empty(&transaction->dev_update_list)); @@ -348,10 +349,10 @@ loop: } /* - * this does all the record keeping required to make sure that a reference - * counted root is properly recorded in a given transaction. This is required - * to make sure the old root from before we joined the transaction is deleted - * when the transaction commits + * This does all the record keeping required to make sure that a shareable root + * is properly recorded in a given transaction. This is required to make sure + * the old root from before we joined the transaction is deleted when the + * transaction commits. */ static int record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -359,7 +360,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; - if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && root->last_trans < trans->transid) || force) { WARN_ON(root == fs_info->extent_root); WARN_ON(!force && root->commit_root != root->node); @@ -438,7 +439,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; - if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return 0; /* @@ -503,7 +504,7 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root) struct btrfs_fs_info *fs_info = root->fs_info; if (!fs_info->reloc_ctl || - !test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + !test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || root->reloc_root) return false; @@ -523,6 +524,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, u64 num_bytes = 0; u64 qgroup_reserved = 0; bool reloc_reserved = false; + bool do_chunk_alloc = false; int ret; /* Send isn't supposed to start transactions. */ @@ -563,7 +565,8 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, * refill that amount for whatever is missing in the reserve. */ num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); - if (delayed_refs_rsv->full == 0) { + if (flush == BTRFS_RESERVE_FLUSH_ALL && + delayed_refs_rsv->full == 0) { delayed_refs_bytes = num_bytes; num_bytes <<= 1; } @@ -584,6 +587,9 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, delayed_refs_bytes); num_bytes -= delayed_refs_bytes; } + + if (rsv->space_info->force_alloc) + do_chunk_alloc = true; } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL && !delayed_refs_rsv->full) { /* @@ -666,6 +672,19 @@ got_it: current->journal_info = h; /* + * If the space_info is marked ALLOC_FORCE then we'll get upgraded to + * ALLOC_FORCE the first run through, and then we won't allocate for + * anybody else who races in later. We don't care about the return + * value here. + */ + if (do_chunk_alloc && num_bytes) { + u64 flags = h->block_rsv->space_info->flags; + + btrfs_chunk_alloc(h, btrfs_get_alloc_profile(fs_info, flags), + CHUNK_ALLOC_NO_FORCE); + } + + /* * btrfs_record_root_in_trans() needs to alloc new extents, and may * call btrfs_join_transaction() while we're also starting a * transaction. @@ -699,43 +718,10 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( struct btrfs_root *root, - unsigned int num_items, - int min_factor) + unsigned int num_items) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_trans_handle *trans; - u64 num_bytes; - int ret; - - /* - * We have two callers: unlink and block group removal. The - * former should succeed even if we will temporarily exceed - * quota and the latter operates on the extent root so - * qgroup enforcement is ignored anyway. - */ - trans = start_transaction(root, num_items, TRANS_START, - BTRFS_RESERVE_FLUSH_ALL, false); - if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) - return trans; - - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return trans; - - num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); - ret = btrfs_cond_migrate_bytes(fs_info, &fs_info->trans_block_rsv, - num_bytes, min_factor); - if (ret) { - btrfs_end_transaction(trans); - return ERR_PTR(ret); - } - - trans->block_rsv = &fs_info->trans_block_rsv; - trans->bytes_reserved = num_bytes; - trace_btrfs_space_reservation(fs_info, "transaction", - trans->transid, num_bytes, 1); - - return trans; + return start_transaction(root, num_items, TRANS_START, + BTRFS_RESERVE_FLUSH_ALL_STEAL, false); } struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) @@ -1644,7 +1630,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } key.offset = (u64)-1; - pending->snap = btrfs_get_fs_root(fs_info, &key, true); + pending->snap = btrfs_get_fs_root(fs_info, objectid, true); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); btrfs_abort_transaction(trans, ret); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 31ae8d273065..bf102e64bfb2 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -193,8 +193,7 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, unsigned int num_items); struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( struct btrfs_root *root, - unsigned int num_items, - int min_factor); + unsigned int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index a92f8a6dd192..517b44300a05 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -957,10 +957,6 @@ static int check_dev_item(struct extent_buffer *leaf, return 0; } -/* Inode item error output has the same format as dir_item_err() */ -#define inode_item_err(eb, slot, fmt, ...) \ - dir_item_err(eb, slot, fmt, __VA_ARGS__) - static int check_inode_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 5f9e2dd413af..16c3a6d2586d 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -35,7 +35,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, goto out; } - if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) goto out; path = btrfs_alloc_path(); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 02ebdd9edc19..920cee312f4e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -505,13 +505,8 @@ insert: */ if (S_ISREG(btrfs_inode_mode(eb, src_item)) && S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && - ino_size != 0) { - struct btrfs_map_token token; - - btrfs_init_map_token(&token, dst_eb); - btrfs_set_token_inode_size(dst_eb, dst_item, - ino_size, &token); - } + ino_size != 0) + btrfs_set_inode_size(dst_eb, dst_item, ino_size); goto no_copy; } @@ -555,13 +550,9 @@ no_copy: static noinline struct inode *read_one_inode(struct btrfs_root *root, u64 objectid) { - struct btrfs_key key; struct inode *inode; - key.objectid = objectid; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - inode = btrfs_iget(root->fs_info->sb, &key, root); + inode = btrfs_iget(root->fs_info->sb, objectid, root); if (IS_ERR(inode)) inode = NULL; return inode; @@ -3299,6 +3290,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans, clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); + extent_io_tree_release(&log->log_csum_range); btrfs_put_root(log); } @@ -3816,8 +3808,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, found_key.offset = 0; found_key.type = 0; - ret = btrfs_bin_search(path->nodes[0], &found_key, 0, - &start_slot); + ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot); if (ret < 0) break; @@ -3853,44 +3844,41 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, * just to say 'this inode exists' and a logging * to say 'update this inode with these values' */ - btrfs_set_token_inode_generation(leaf, item, 0, &token); - btrfs_set_token_inode_size(leaf, item, logged_isize, &token); + btrfs_set_token_inode_generation(&token, item, 0); + btrfs_set_token_inode_size(&token, item, logged_isize); } else { - btrfs_set_token_inode_generation(leaf, item, - BTRFS_I(inode)->generation, - &token); - btrfs_set_token_inode_size(leaf, item, inode->i_size, &token); - } - - btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); - btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); - btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); - btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); - - btrfs_set_token_timespec_sec(leaf, &item->atime, - inode->i_atime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, &item->atime, - inode->i_atime.tv_nsec, &token); - - btrfs_set_token_timespec_sec(leaf, &item->mtime, - inode->i_mtime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, &item->mtime, - inode->i_mtime.tv_nsec, &token); - - btrfs_set_token_timespec_sec(leaf, &item->ctime, - inode->i_ctime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, &item->ctime, - inode->i_ctime.tv_nsec, &token); - - btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), - &token); - - btrfs_set_token_inode_sequence(leaf, item, - inode_peek_iversion(inode), &token); - btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); - btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); - btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); - btrfs_set_token_inode_block_group(leaf, item, 0, &token); + btrfs_set_token_inode_generation(&token, item, + BTRFS_I(inode)->generation); + btrfs_set_token_inode_size(&token, item, inode->i_size); + } + + btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); + btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); + btrfs_set_token_inode_mode(&token, item, inode->i_mode); + btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); + + btrfs_set_token_timespec_sec(&token, &item->atime, + inode->i_atime.tv_sec); + btrfs_set_token_timespec_nsec(&token, &item->atime, + inode->i_atime.tv_nsec); + + btrfs_set_token_timespec_sec(&token, &item->mtime, + inode->i_mtime.tv_sec); + btrfs_set_token_timespec_nsec(&token, &item->mtime, + inode->i_mtime.tv_nsec); + + btrfs_set_token_timespec_sec(&token, &item->ctime, + inode->i_ctime.tv_sec); + btrfs_set_token_timespec_nsec(&token, &item->ctime, + inode->i_ctime.tv_nsec); + + btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); + + btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); + btrfs_set_token_inode_transid(&token, item, trans->transid); + btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); + btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags); + btrfs_set_token_inode_block_group(&token, item, 0); } static int log_inode_item(struct btrfs_trans_handle *trans, @@ -3916,9 +3904,21 @@ static int log_csums(struct btrfs_trans_handle *trans, struct btrfs_root *log_root, struct btrfs_ordered_sum *sums) { + const u64 lock_end = sums->bytenr + sums->len - 1; + struct extent_state *cached_state = NULL; int ret; /* + * Serialize logging for checksums. This is to avoid racing with the + * same checksum being logged by another task that is logging another + * file which happens to refer to the same extent as well. Such races + * can leave checksum items in the log with overlapping ranges. + */ + ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr, + lock_end, &cached_state); + if (ret) + return ret; + /* * Due to extent cloning, we might have logged a csum item that covers a * subrange of a cloned extent, and later we can end up logging a csum * item for a larger subrange of the same extent or the entire range. @@ -3928,10 +3928,13 @@ static int log_csums(struct btrfs_trans_handle *trans, * trim and adjust) any existing csum items in the log for this range. */ ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); - if (ret) - return ret; + if (!ret) + ret = btrfs_csum_file_blocks(trans, log_root, sums); - return btrfs_csum_file_blocks(trans, log_root, sums); + unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end, + &cached_state); + + return ret; } static noinline int copy_items(struct btrfs_trans_handle *trans, @@ -4164,43 +4167,35 @@ static int log_one_extent(struct btrfs_trans_handle *trans, fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, - &token); + btrfs_set_token_file_extent_generation(&token, fi, trans->transid); if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - btrfs_set_token_file_extent_type(leaf, fi, - BTRFS_FILE_EXTENT_PREALLOC, - &token); + btrfs_set_token_file_extent_type(&token, fi, + BTRFS_FILE_EXTENT_PREALLOC); else - btrfs_set_token_file_extent_type(leaf, fi, - BTRFS_FILE_EXTENT_REG, - &token); + btrfs_set_token_file_extent_type(&token, fi, + BTRFS_FILE_EXTENT_REG); block_len = max(em->block_len, em->orig_block_len); if (em->compress_type != BTRFS_COMPRESS_NONE) { - btrfs_set_token_file_extent_disk_bytenr(leaf, fi, - em->block_start, - &token); - btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, - &token); + btrfs_set_token_file_extent_disk_bytenr(&token, fi, + em->block_start); + btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { - btrfs_set_token_file_extent_disk_bytenr(leaf, fi, + btrfs_set_token_file_extent_disk_bytenr(&token, fi, em->block_start - - extent_offset, &token); - btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, - &token); + extent_offset); + btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len); } else { - btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); - btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, - &token); - } - - btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); - btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); - btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); - btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, - &token); - btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); - btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); + btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0); + btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0); + } + + btrfs_set_token_file_extent_offset(&token, fi, extent_offset); + btrfs_set_token_file_extent_num_bytes(&token, fi, em->len); + btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes); + btrfs_set_token_file_extent_compression(&token, fi, em->compress_type); + btrfs_set_token_file_extent_encryption(&token, fi, 0); + btrfs_set_token_file_extent_other_encoding(&token, fi, 0); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); @@ -4336,12 +4331,9 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, } } } - if (ins_nr > 0) { + if (ins_nr > 0) ret = copy_items(trans, inode, dst_path, path, start_slot, ins_nr, 1, 0); - if (ret > 0) - ret = 0; - } out: btrfs_release_path(path); btrfs_free_path(dst_path); @@ -4835,10 +4827,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, btrfs_release_path(path); - key.objectid = ino; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - inode = btrfs_iget(fs_info->sb, &key, root); + inode = btrfs_iget(fs_info->sb, ino, root); /* * If the other inode that had a conflicting dir entry was * deleted in the current transaction, we need to log its parent @@ -4847,8 +4836,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) { ret = PTR_ERR(inode); if (ret == -ENOENT) { - key.objectid = parent; - inode = btrfs_iget(fs_info->sb, &key, root); + inode = btrfs_iget(fs_info->sb, parent, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); } else { @@ -5587,7 +5575,7 @@ process_leaf: continue; btrfs_release_path(path); - di_inode = btrfs_iget(fs_info->sb, &di_key, root); + di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root); if (IS_ERR(di_inode)) { ret = PTR_ERR(di_inode); goto next_dir_inode; @@ -5713,7 +5701,8 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, cur_offset = item_size; } - dir_inode = btrfs_iget(fs_info->sb, &inode_key, root); + dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid, + root); /* * If the parent inode was deleted, return an error to * fallback to a transaction commit. This is to prevent @@ -5780,14 +5769,17 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, int slot = path->slots[0]; struct btrfs_key search_key; struct inode *inode; + u64 ino; int ret = 0; btrfs_release_path(path); + ino = found_key.offset; + search_key.objectid = found_key.offset; search_key.type = BTRFS_INODE_ITEM_KEY; search_key.offset = 0; - inode = btrfs_iget(fs_info->sb, &search_key, root); + inode = btrfs_iget(fs_info->sb, ino, root); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -6132,7 +6124,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) struct btrfs_trans_handle *trans; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_key tmp_key; struct btrfs_root *log; struct btrfs_fs_info *fs_info = log_root_tree->fs_info; struct walk_control wc = { @@ -6194,11 +6185,8 @@ again: goto error; } - tmp_key.objectid = found_key.offset; - tmp_key.type = BTRFS_ROOT_ITEM_KEY; - tmp_key.offset = (u64)-1; - - wc.replay_dest = btrfs_get_fs_root(fs_info, &tmp_key, true); + wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset, + true); if (IS_ERR(wc.replay_dest)) { ret = PTR_ERR(wc.replay_dest); diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 76671a6bcb61..28525ad7ff8c 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -257,7 +257,6 @@ out: static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, u8 *uuid, u8 type, u64 subvolid) { - struct btrfs_key key; int ret = 0; struct btrfs_root *subvol_root; @@ -265,10 +264,7 @@ static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) goto out; - key.objectid = subvolid; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - subvol_root = btrfs_get_fs_root(fs_info, &key, true); + subvol_root = btrfs_get_fs_root(fs_info, subvolid, true); if (IS_ERR(subvol_root)) { ret = PTR_ERR(subvol_root); if (ret == -ENOENT) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c1909e5f4506..0d6e785bcb98 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -280,10 +280,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * ============ * * uuid_mutex - * volume_mutex - * device_list_mutex - * chunk_mutex - * balance_mutex + * device_list_mutex + * chunk_mutex + * balance_mutex * * * Exclusive operations, BTRFS_FS_EXCL_OP @@ -1042,6 +1041,8 @@ again: &device->dev_state)) { if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state) && + !test_bit(BTRFS_DEV_STATE_MISSING, + &device->dev_state) && (!latest_dev || device->generation > latest_dev->generation)) { latest_dev = device; @@ -1185,7 +1186,6 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, { struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; - int ret = 0; flags |= FMODE_EXCL; @@ -1198,16 +1198,15 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, device->generation > latest_dev->generation) latest_dev = device; } - if (fs_devices->open_devices == 0) { - ret = -EINVAL; - goto out; - } + if (fs_devices->open_devices == 0) + return -EINVAL; + fs_devices->opened = 1; fs_devices->latest_bdev = latest_dev->bdev; fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; -out: - return ret; + + return 0; } static int devid_cmp(void *priv, struct list_head *a, struct list_head *b) @@ -1251,49 +1250,48 @@ void btrfs_release_disk_super(struct btrfs_super_block *super) put_page(page); } -static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, - struct page **page, - struct btrfs_super_block **disk_super) +static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, + u64 bytenr) { + struct btrfs_super_block *disk_super; + struct page *page; void *p; pgoff_t index; /* make sure our super fits in the device */ if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) - return 1; + return ERR_PTR(-EINVAL); /* make sure our super fits in the page */ - if (sizeof(**disk_super) > PAGE_SIZE) - return 1; + if (sizeof(*disk_super) > PAGE_SIZE) + return ERR_PTR(-EINVAL); /* make sure our super doesn't straddle pages on disk */ index = bytenr >> PAGE_SHIFT; - if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index) - return 1; + if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index) + return ERR_PTR(-EINVAL); /* pull in the page with our super */ - *page = read_cache_page_gfp(bdev->bd_inode->i_mapping, - index, GFP_KERNEL); + page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL); - if (IS_ERR(*page)) - return 1; + if (IS_ERR(page)) + return ERR_CAST(page); - p = page_address(*page); + p = page_address(page); /* align our pointer to the offset of the super block */ - *disk_super = p + offset_in_page(bytenr); + disk_super = p + offset_in_page(bytenr); - if (btrfs_super_bytenr(*disk_super) != bytenr || - btrfs_super_magic(*disk_super) != BTRFS_MAGIC) { + if (btrfs_super_bytenr(disk_super) != bytenr || + btrfs_super_magic(disk_super) != BTRFS_MAGIC) { btrfs_release_disk_super(p); - return 1; + return ERR_PTR(-EINVAL); } - if ((*disk_super)->label[0] && - (*disk_super)->label[BTRFS_LABEL_SIZE - 1]) - (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0'; + if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1]) + disk_super->label[BTRFS_LABEL_SIZE - 1] = 0; - return 0; + return disk_super; } int btrfs_forget_devices(const char *path) @@ -1319,7 +1317,6 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, bool new_device_added = false; struct btrfs_device *device = NULL; struct block_device *bdev; - struct page *page; u64 bytenr; lockdep_assert_held(&uuid_mutex); @@ -1337,8 +1334,9 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, if (IS_ERR(bdev)) return ERR_CAST(bdev); - if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) { - device = ERR_PTR(-EINVAL); + disk_super = btrfs_read_disk_super(bdev, bytenr); + if (IS_ERR(disk_super)) { + device = ERR_CAST(disk_super); goto error_bdev_put; } @@ -2663,8 +2661,18 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path ret = btrfs_commit_transaction(trans); } - /* Update ctime/mtime for libblkid */ + /* + * Now that we have written a new super block to this device, check all + * other fs_devices list if device_path alienates any other scanned + * device. + * We can ignore the return value as it typically returns -EINVAL and + * only succeeds if the device was an alien. + */ + btrfs_forget_devices(device_path); + + /* Update ctime/mtime for blkid or udev */ update_dev_time(device_path); + return ret; error_sysfs: diff --git a/fs/direct-io.c b/fs/direct-io.c index 6d5370eac2a8..1543b5af400e 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -386,25 +386,6 @@ static void dio_bio_end_io(struct bio *bio) spin_unlock_irqrestore(&dio->bio_lock, flags); } -/** - * dio_end_io - handle the end io action for the given bio - * @bio: The direct io bio thats being completed - * - * This is meant to be called by any filesystem that uses their own dio_submit_t - * so that the DIO specific endio actions are dealt with after the filesystem - * has done it's completion work. - */ -void dio_end_io(struct bio *bio) -{ - struct dio *dio = bio->bi_private; - - if (dio->is_async) - dio_bio_end_aio(bio); - else - dio_bio_end_io(bio); -} -EXPORT_SYMBOL_GPL(dio_end_io); - static inline void dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, struct block_device *bdev, diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index fd3bd06fabb6..ec7b78e6feca 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -59,7 +59,7 @@ int iomap_dio_iopoll(struct kiocb *kiocb, bool spin) EXPORT_SYMBOL_GPL(iomap_dio_iopoll); static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, - struct bio *bio) + struct bio *bio, loff_t pos) { atomic_inc(&dio->ref); @@ -67,7 +67,12 @@ static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, bio_set_polled(bio, dio->iocb); dio->submit.last_queue = bdev_get_queue(iomap->bdev); - dio->submit.cookie = submit_bio(bio); + if (dio->dops && dio->dops->submit_io) + dio->submit.cookie = dio->dops->submit_io( + file_inode(dio->iocb->ki_filp), + iomap, bio, pos); + else + dio->submit.cookie = submit_bio(bio); } static ssize_t iomap_dio_complete(struct iomap_dio *dio) @@ -191,7 +196,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, get_page(page); __bio_add_page(bio, page, len, 0); bio_set_op_attrs(bio, REQ_OP_WRITE, flags); - iomap_dio_submit_bio(dio, iomap, bio); + iomap_dio_submit_bio(dio, iomap, bio, pos); } static loff_t @@ -299,11 +304,11 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, } dio->size += n; - pos += n; copied += n; nr_pages = iov_iter_npages(dio->submit.iter, BIO_MAX_PAGES); - iomap_dio_submit_bio(dio, iomap, bio); + iomap_dio_submit_bio(dio, iomap, bio, pos); + pos += n; } while (nr_pages); /* @@ -411,8 +416,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct blk_plug plug; struct iomap_dio *dio; - lockdep_assert_held(&inode->i_rwsem); - if (!count) return 0; diff --git a/include/linux/bio.h b/include/linux/bio.h index 683ff5fd8871..91676d4b2dfe 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -169,6 +169,14 @@ static inline void bio_advance_iter(const struct bio *bio, #define bio_for_each_bvec(bvl, bio, iter) \ __bio_for_each_bvec(bvl, bio, iter, (bio)->bi_iter) +/* + * Iterate over all multi-page bvecs. Drivers shouldn't use this version for the + * same reasons as bio_for_each_segment_all(). + */ +#define bio_for_each_bvec_all(bvl, bio, i) \ + for (i = 0, bvl = bio_first_bvec_all(bio); \ + i < (bio)->bi_vcnt; i++, bvl++) \ + #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len) static inline unsigned bio_segments(struct bio *bio) diff --git a/include/linux/fs.h b/include/linux/fs.h index f3e167ffbb74..cffc3619eed5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3148,6 +3148,8 @@ extern int generic_file_rw_checks(struct file *file_in, struct file *file_out); extern int generic_copy_file_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t *count, unsigned int flags); +extern ssize_t generic_file_buffered_read(struct kiocb *iocb, + struct iov_iter *to, ssize_t already_read); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); @@ -3209,8 +3211,6 @@ enum { DIO_SKIP_HOLES = 0x02, }; -void dio_end_io(struct bio *bio); - ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, get_block_t get_block, diff --git a/include/linux/iomap.h b/include/linux/iomap.h index bc20bd04c2a2..a5c219c29b10 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -251,6 +251,8 @@ int iomap_writepages(struct address_space *mapping, struct iomap_dio_ops { int (*end_io)(struct kiocb *iocb, ssize_t size, int error, unsigned flags); + blk_qc_t (*submit_io)(struct inode *inode, struct iomap *iomap, + struct bio *bio, loff_t file_offset); }; ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index bcbc763b8814..360b0f9d2220 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -89,6 +89,7 @@ TRACE_DEFINE_ENUM(COMMIT_TRANS); { IO_TREE_TRANS_DIRTY_PAGES, "TRANS_DIRTY_PAGES" }, \ { IO_TREE_ROOT_DIRTY_LOG_PAGES, "ROOT_DIRTY_LOG_PAGES" }, \ { IO_TREE_INODE_FILE_EXTENT, "INODE_FILE_EXTENT" }, \ + { IO_TREE_LOG_CSUM_RANGE, "LOG_CSUM_RANGE" }, \ { IO_TREE_SELFTEST, "SELFTEST" }) #define BTRFS_GROUP_FLAGS \ diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 8e322e2c7e78..a3f3975df0de 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -519,15 +519,6 @@ struct btrfs_extent_inline_ref { __le64 offset; } __attribute__ ((__packed__)); -/* old style backrefs item */ -struct btrfs_extent_ref_v0 { - __le64 root; - __le64 generation; - __le64 objectid; - __le32 count; -} __attribute__ ((__packed__)); - - /* dev extents record free space on individual devices. The owner * field points back to the chunk allocation mapping tree that allocated * the extent. The chunk tree uuid field is a way to double check the owner diff --git a/mm/filemap.c b/mm/filemap.c index fe079e9219d1..3430280df607 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1991,7 +1991,7 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra) * * total number of bytes copied, including those the were already @written * * negative error code if nothing was copied */ -static ssize_t generic_file_buffered_read(struct kiocb *iocb, +ssize_t generic_file_buffered_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t written) { struct file *filp = iocb->ki_filp; @@ -2243,6 +2243,7 @@ out: file_accessed(filp); return written ? written : error; } +EXPORT_SYMBOL_GPL(generic_file_buffered_read); /** * generic_file_read_iter - generic filesystem read routine |