diff options
Diffstat (limited to 'fs')
73 files changed, 2702 insertions, 2662 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index aac240430efe..ce083e99ef68 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -71,6 +71,16 @@ bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq) return atomic_read(&wq->pending) > wq->thresh * 2; } +static void btrfs_init_workqueue(struct btrfs_workqueue *wq, + struct btrfs_fs_info *fs_info) +{ + wq->fs_info = fs_info; + atomic_set(&wq->pending, 0); + INIT_LIST_HEAD(&wq->ordered_list); + spin_lock_init(&wq->list_lock); + spin_lock_init(&wq->thres_lock); +} + struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, unsigned int flags, int limit_active, int thresh) @@ -80,9 +90,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, if (!ret) return NULL; - ret->fs_info = fs_info; + btrfs_init_workqueue(ret, fs_info); + ret->limit_active = limit_active; - atomic_set(&ret->pending, 0); if (thresh == 0) thresh = DFT_THRESHOLD; /* For low threshold, disabling threshold is a better choice */ @@ -106,9 +116,33 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, return NULL; } - INIT_LIST_HEAD(&ret->ordered_list); - spin_lock_init(&ret->list_lock); - spin_lock_init(&ret->thres_lock); + trace_btrfs_workqueue_alloc(ret, name); + return ret; +} + +struct btrfs_workqueue *btrfs_alloc_ordered_workqueue( + struct btrfs_fs_info *fs_info, const char *name, + unsigned int flags) +{ + struct btrfs_workqueue *ret; + + ret = kzalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return NULL; + + btrfs_init_workqueue(ret, fs_info); + + /* Ordered workqueues don't allow @max_active adjustments. */ + ret->limit_active = 1; + ret->current_active = 1; + ret->thresh = NO_THRESHOLD; + + ret->normal_wq = alloc_ordered_workqueue("btrfs-%s", flags, name); + if (!ret->normal_wq) { + kfree(ret); + return NULL; + } + trace_btrfs_workqueue_alloc(ret, name); return ret; } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 6e2596ddae10..30f66c5e2e6e 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -31,6 +31,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, unsigned int flags, int limit_active, int thresh); +struct btrfs_workqueue *btrfs_alloc_ordered_workqueue( + struct btrfs_fs_info *fs_info, const char *name, + unsigned int flags); void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func, btrfs_func_t ordered_func, btrfs_func_t ordered_free); void btrfs_queue_work(struct btrfs_workqueue *wq, diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index b3ad0f51e616..12b12443efaa 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -27,6 +27,17 @@ struct btrfs_failed_bio { atomic_t repair_count; }; +/* Is this a data path I/O that needs storage layer checksum and repair? */ +static inline bool is_data_bbio(struct btrfs_bio *bbio) +{ + return bbio->inode && is_data_inode(&bbio->inode->vfs_inode); +} + +static bool bbio_has_ordered_extent(struct btrfs_bio *bbio) +{ + return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE; +} + /* * Initialize a btrfs_bio structure. This skips the embedded bio itself as it * is already initialized by the block layer. @@ -61,20 +72,6 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, return bbio; } -static blk_status_t btrfs_bio_extract_ordered_extent(struct btrfs_bio *bbio) -{ - struct btrfs_ordered_extent *ordered; - int ret; - - ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset); - if (WARN_ON_ONCE(!ordered)) - return BLK_STS_IOERR; - ret = btrfs_extract_ordered_extent(bbio, ordered); - btrfs_put_ordered_extent(ordered); - - return errno_to_blk_status(ret); -} - static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, struct btrfs_bio *orig_bbio, u64 map_length, bool use_append) @@ -95,13 +92,41 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); bbio->inode = orig_bbio->inode; bbio->file_offset = orig_bbio->file_offset; - if (!(orig_bbio->bio.bi_opf & REQ_BTRFS_ONE_ORDERED)) - orig_bbio->file_offset += map_length; - + orig_bbio->file_offset += map_length; + if (bbio_has_ordered_extent(bbio)) { + refcount_inc(&orig_bbio->ordered->refs); + bbio->ordered = orig_bbio->ordered; + } atomic_inc(&orig_bbio->pending_ios); return bbio; } +/* Free a bio that was never submitted to the underlying device. */ +static void btrfs_cleanup_bio(struct btrfs_bio *bbio) +{ + if (bbio_has_ordered_extent(bbio)) + btrfs_put_ordered_extent(bbio->ordered); + bio_put(&bbio->bio); +} + +static void __btrfs_bio_end_io(struct btrfs_bio *bbio) +{ + if (bbio_has_ordered_extent(bbio)) { + struct btrfs_ordered_extent *ordered = bbio->ordered; + + bbio->end_io(bbio); + btrfs_put_ordered_extent(ordered); + } else { + bbio->end_io(bbio); + } +} + +void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) +{ + bbio->bio.bi_status = status; + __btrfs_bio_end_io(bbio); +} + static void btrfs_orig_write_end_io(struct bio *bio); static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, @@ -130,12 +155,12 @@ static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) if (bbio->bio.bi_status) btrfs_bbio_propagate_error(bbio, orig_bbio); - bio_put(&bbio->bio); + btrfs_cleanup_bio(bbio); bbio = orig_bbio; } if (atomic_dec_and_test(&bbio->pending_ios)) - bbio->end_io(bbio); + __btrfs_bio_end_io(bbio); } static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) @@ -327,7 +352,7 @@ static void btrfs_end_bio_work(struct work_struct *work) struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); /* Metadata reads are checked and repaired by the submitter. */ - if (bbio->inode && !(bbio->bio.bi_opf & REQ_META)) + if (is_data_bbio(bbio)) btrfs_check_read_bio(bbio, bbio->bio.bi_private); else btrfs_orig_bbio_end_io(bbio); @@ -348,7 +373,7 @@ static void btrfs_simple_end_io(struct bio *bio) INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } else { - if (bio_op(bio) == REQ_OP_ZONE_APPEND) + if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) btrfs_record_physical_zoned(bbio); btrfs_orig_bbio_end_io(bbio); } @@ -361,8 +386,7 @@ static void btrfs_raid56_end_io(struct bio *bio) btrfs_bio_counter_dec(bioc->fs_info); bbio->mirror_num = bioc->mirror_num; - if (bio_op(bio) == REQ_OP_READ && bbio->inode && - !(bbio->bio.bi_opf & REQ_META)) + if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) btrfs_check_read_bio(bbio, NULL); else btrfs_orig_bbio_end_io(bbio); @@ -472,13 +496,12 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, struct btrfs_io_stripe *smap, int mirror_num) { - /* Do not leak our private flag into the block layer. */ - bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED; - if (!bioc) { /* Single mirror read/write fast path. */ btrfs_bio(bio)->mirror_num = mirror_num; bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; + if (bio_op(bio) != REQ_OP_READ) + btrfs_bio(bio)->orig_physical = smap->physical; bio->bi_private = smap->dev; bio->bi_end_io = btrfs_simple_end_io; btrfs_submit_dev_bio(smap->dev, bio); @@ -574,27 +597,20 @@ static void run_one_async_free(struct btrfs_work *work) static bool should_async_write(struct btrfs_bio *bbio) { - /* - * If the I/O is not issued by fsync and friends, (->sync_writers != 0), - * then try to defer the submission to a workqueue to parallelize the - * checksum calculation. - */ - if (atomic_read(&bbio->inode->sync_writers)) + /* Submit synchronously if the checksum implementation is fast. */ + if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags)) return false; /* - * Submit metadata writes synchronously if the checksum implementation - * is fast, or we are on a zoned device that wants I/O to be submitted - * in order. + * Try to defer the submission to a workqueue to parallelize the + * checksum calculation unless the I/O is issued synchronously. */ - if (bbio->bio.bi_opf & REQ_META) { - struct btrfs_fs_info *fs_info = bbio->fs_info; + if (op_is_sync(bbio->bio.bi_opf)) + return false; - if (btrfs_is_zoned(fs_info)) - return false; - if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) - return false; - } + /* Zoned devices require I/O to be submitted in order. */ + if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info)) + return false; return true; } @@ -622,10 +638,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, run_one_async_free); - if (op_is_sync(bbio->bio.bi_opf)) - btrfs_queue_work(fs_info->hipri_workers, &async->work); - else - btrfs_queue_work(fs_info->workers, &async->work); + btrfs_queue_work(fs_info->workers, &async->work); return true; } @@ -635,7 +648,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) struct btrfs_fs_info *fs_info = bbio->fs_info; struct btrfs_bio *orig_bbio = bbio; struct bio *bio = &bbio->bio; - u64 logical = bio->bi_iter.bi_sector << 9; + u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 length = bio->bi_iter.bi_size; u64 map_length = length; bool use_append = btrfs_use_zone_append(bbio); @@ -645,8 +658,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) int error; btrfs_bio_counter_inc_blocked(fs_info); - error = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - &bioc, &smap, &mirror_num, 1); + error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, + &bioc, &smap, &mirror_num, 1); if (error) { ret = errno_to_blk_status(error); goto fail; @@ -665,7 +678,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) * Save the iter for the end_io handler and preload the checksums for * data reads. */ - if (bio_op(bio) == REQ_OP_READ && inode && !(bio->bi_opf & REQ_META)) { + if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) { bbio->saved_iter = bio->bi_iter; ret = btrfs_lookup_bio_sums(bbio); if (ret) @@ -676,9 +689,6 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) if (use_append) { bio->bi_opf &= ~REQ_OP_WRITE; bio->bi_opf |= REQ_OP_ZONE_APPEND; - ret = btrfs_bio_extract_ordered_extent(bbio); - if (ret) - goto fail_put_bio; } /* @@ -695,6 +705,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) ret = btrfs_bio_csum(bbio); if (ret) goto fail_put_bio; + } else if (use_append) { + ret = btrfs_alloc_dummy_sum(bbio); + if (ret) + goto fail_put_bio; } } @@ -704,7 +718,7 @@ done: fail_put_bio: if (map_length < length) - bio_put(bio); + btrfs_cleanup_bio(bbio); fail: btrfs_bio_counter_dec(fs_info); btrfs_bio_end_io(orig_bbio, ret); diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index a8eca3a65673..ca79decee060 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -39,8 +39,8 @@ struct btrfs_bio { union { /* - * Data checksumming and original I/O information for internal - * use in the btrfs_submit_bio machinery. + * For data reads: checksumming and original I/O information. + * (for internal use in the btrfs_submit_bio machinery only) */ struct { u8 *csum; @@ -48,7 +48,20 @@ struct btrfs_bio { struct bvec_iter saved_iter; }; - /* For metadata parentness verification. */ + /* + * For data writes: + * - ordered extent covering the bio + * - pointer to the checksums for this bio + * - original physical address from the allocator + * (for zone append only) + */ + struct { + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_sum *sums; + u64 orig_physical; + }; + + /* For metadata reads: parentness verification. */ struct btrfs_tree_parent_check parent_check; }; @@ -84,15 +97,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, struct btrfs_fs_info *fs_info, btrfs_bio_end_io_t end_io, void *private); - -static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) -{ - bbio->bio.bi_status = status; - bbio->end_io(bbio); -} - -/* Bio only refers to one ordered extent. */ -#define REQ_BTRFS_ONE_ORDERED REQ_DRV +void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status); /* Submit using blkcg_punt_bio_submit. */ #define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index e97af2e510c3..48ae509f2ac2 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -95,14 +95,21 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) } allowed &= flags; - if (allowed & BTRFS_BLOCK_GROUP_RAID6) + /* Select the highest-redundancy RAID level. */ + if (allowed & BTRFS_BLOCK_GROUP_RAID1C4) + allowed = BTRFS_BLOCK_GROUP_RAID1C4; + else if (allowed & BTRFS_BLOCK_GROUP_RAID6) allowed = BTRFS_BLOCK_GROUP_RAID6; + else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3) + allowed = BTRFS_BLOCK_GROUP_RAID1C3; else if (allowed & BTRFS_BLOCK_GROUP_RAID5) allowed = BTRFS_BLOCK_GROUP_RAID5; else if (allowed & BTRFS_BLOCK_GROUP_RAID10) allowed = BTRFS_BLOCK_GROUP_RAID10; else if (allowed & BTRFS_BLOCK_GROUP_RAID1) allowed = BTRFS_BLOCK_GROUP_RAID1; + else if (allowed & BTRFS_BLOCK_GROUP_DUP) + allowed = BTRFS_BLOCK_GROUP_DUP; else if (allowed & BTRFS_BLOCK_GROUP_RAID0) allowed = BTRFS_BLOCK_GROUP_RAID0; @@ -1633,11 +1640,14 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = bg->fs_info; + trace_btrfs_add_unused_block_group(bg); spin_lock(&fs_info->unused_bgs_lock); if (list_empty(&bg->bg_list)) { btrfs_get_block_group(bg); - trace_btrfs_add_unused_block_group(bg); list_add_tail(&bg->bg_list, &fs_info->unused_bgs); + } else { + /* Pull out the block group from the reclaim_bgs list. */ + list_move_tail(&bg->bg_list, &fs_info->unused_bgs); } spin_unlock(&fs_info->unused_bgs_lock); } @@ -1791,8 +1801,15 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) } spin_unlock(&bg->lock); - /* Get out fast, in case we're unmounting the filesystem */ - if (btrfs_fs_closing(fs_info)) { + /* + * Get out fast, in case we're read-only or unmounting the + * filesystem. It is OK to drop block groups from the list even + * for the read-only case. As we did sb_start_write(), + * "mount -o remount,ro" won't happen and read-only filesystem + * means it is forced read-only due to a fatal error. So, it + * never gets back to read-write to let us reclaim again. + */ + if (btrfs_need_cleaner_sleep(fs_info)) { up_write(&space_info->groups_sem); goto next; } @@ -1823,11 +1840,27 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) } next: + if (ret) + btrfs_mark_bg_to_reclaim(bg); btrfs_put_block_group(bg); + + mutex_unlock(&fs_info->reclaim_bgs_lock); + /* + * Reclaiming all the block groups in the list can take really + * long. Prioritize cleaning up unused block groups. + */ + btrfs_delete_unused_bgs(fs_info); + /* + * If we are interrupted by a balance, we can just bail out. The + * cleaner thread restart again if necessary. + */ + if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) + goto end; spin_lock(&fs_info->unused_bgs_lock); } spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); +end: btrfs_exclop_finish(fs_info); sb_end_write(fs_info->sb); } @@ -3521,9 +3554,9 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&space_info->lock); - set_extent_dirty(&trans->transaction->pinned_extents, - bytenr, bytenr + num_bytes - 1, - GFP_NOFS | __GFP_NOFAIL); + set_extent_bit(&trans->transaction->pinned_extents, + bytenr, bytenr + num_bytes - 1, + EXTENT_DIRTY, NULL); } spin_lock(&trans->transaction->dirty_bgs_lock); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index cc0e4b37db2d..f204addc3fe8 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -162,7 +162,14 @@ struct btrfs_block_group { */ struct list_head cluster_list; - /* For delayed block group creation or deletion of empty block groups */ + /* + * Used for several lists: + * + * 1) struct btrfs_fs_info::unused_bgs + * 2) struct btrfs_fs_info::reclaim_bgs + * 3) struct btrfs_transaction::deleted_bgs + * 4) struct btrfs_trans_handle::new_bgs + */ struct list_head bg_list; /* For read-only block groups */ diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index ac18c43fadad..6279d200cf83 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -541,3 +541,22 @@ try_reserve: return ERR_PTR(ret); } + +int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv) +{ + u64 needed_bytes; + int ret; + + /* 1 for slack space, 1 for updating the inode */ + needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) + + btrfs_calc_metadata_size(fs_info, 1); + + spin_lock(&rsv->lock); + if (rsv->reserved < needed_bytes) + ret = -ENOSPC; + else + ret = 0; + spin_unlock(&rsv->lock); + return ret; +} diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 6dc781709aca..b0bd12b8652f 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -82,6 +82,8 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info); struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize); +int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv); static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u32 blocksize) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ec2ae4406c16..d47a927b3504 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -116,9 +116,6 @@ struct btrfs_inode { unsigned long runtime_flags; - /* Keep track of who's O_SYNC/fsyncing currently */ - atomic_t sync_writers; - /* full 64 bit generation number, struct vfs_inode doesn't have a big * enough field for this. */ @@ -335,7 +332,7 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, if (btrfs_is_free_space_inode(inode)) return; trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode), - mod); + mod, inode->outstanding_extents); } /* @@ -407,30 +404,12 @@ static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode) return true; } -/* - * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two - * separate u32s. These two functions convert between the two representations. - */ -static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags) -{ - return (flags | ((u64)ro_flags << 32)); -} - -static inline void btrfs_inode_split_flags(u64 inode_item_flags, - u32 *flags, u32 *ro_flags) -{ - *flags = (u32)inode_item_flags; - *ro_flags = (u32)(inode_item_flags >> 32); -} - /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); -int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, - struct btrfs_ordered_extent *ordered); bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u32 bio_offset, struct bio_vec *bv); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 82e49d985019..3caf339c4bb3 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1459,13 +1459,13 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, struct btrfs_fs_info *fs_info = state->fs_info; int ret; u64 length; - struct btrfs_io_context *multi = NULL; + struct btrfs_io_context *bioc = NULL; + struct btrfs_io_stripe smap, *map; struct btrfs_device *device; length = len; - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, - bytenr, &length, &multi, mirror_num); - + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, bytenr, &length, &bioc, + NULL, &mirror_num, 0); if (ret) { block_ctx_out->start = 0; block_ctx_out->dev_bytenr = 0; @@ -1478,21 +1478,26 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, return ret; } - device = multi->stripes[0].dev; + if (bioc) + map = &bioc->stripes[0]; + else + map = &smap; + + device = map->dev; if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) || !device->bdev || !device->name) block_ctx_out->dev = NULL; else block_ctx_out->dev = btrfsic_dev_state_lookup( device->bdev->bd_dev); - block_ctx_out->dev_bytenr = multi->stripes[0].physical; + block_ctx_out->dev_bytenr = map->physical; block_ctx_out->start = bytenr; block_ctx_out->len = len; block_ctx_out->datav = NULL; block_ctx_out->pagev = NULL; block_ctx_out->mem_to_free = NULL; - kfree(multi); + kfree(bioc); if (NULL == block_ctx_out->dev) { ret = -ENXIO; pr_info("btrfsic: error, cannot lookup dev (#1)!\n"); @@ -1565,7 +1570,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, bio = bio_alloc(block_ctx->dev->bdev, num_pages - i, REQ_OP_READ, GFP_NOFS); - bio->bi_iter.bi_sector = dev_bytenr >> 9; + bio->bi_iter.bi_sector = dev_bytenr >> SECTOR_SHIFT; for (j = i; j < num_pages; j++) { ret = bio_add_page(bio, block_ctx->pagev[j], diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 2d0493f0a184..8818ed5c390f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -37,7 +37,7 @@ #include "file-item.h" #include "super.h" -struct bio_set btrfs_compressed_bioset; +static struct bio_set btrfs_compressed_bioset; static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; @@ -211,8 +211,6 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb) for (i = 0; i < ret; i++) { struct folio *folio = fbatch.folios[i]; - if (errno) - folio_set_error(folio); btrfs_page_clamp_clear_writeback(fs_info, &folio->page, cb->start, cb->len); } @@ -226,13 +224,8 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) struct compressed_bio *cb = container_of(work, struct compressed_bio, write_end_work); - /* - * Ok, we're the last bio for this extent, step one is to call back - * into the FS and do all the end_io operations. - */ - btrfs_writepage_endio_finish_ordered(cb->bbio.inode, NULL, - cb->start, cb->start + cb->len - 1, - cb->bbio.bio.bi_status == BLK_STS_OK); + btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len, + cb->bbio.bio.bi_status == BLK_STS_OK); if (cb->writeback) end_compressed_writeback(cb); @@ -281,32 +274,31 @@ static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb) * This also checksums the file bytes and gets things ready for * the end io hooks. */ -void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, - unsigned int len, u64 disk_start, - unsigned int compressed_len, - struct page **compressed_pages, - unsigned int nr_pages, - blk_opf_t write_flags, - bool writeback) +void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, + struct page **compressed_pages, + unsigned int nr_pages, + blk_opf_t write_flags, + bool writeback) { + struct btrfs_inode *inode = BTRFS_I(ordered->inode); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct compressed_bio *cb; - ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && - IS_ALIGNED(len, fs_info->sectorsize)); - - write_flags |= REQ_BTRFS_ONE_ORDERED; + ASSERT(IS_ALIGNED(ordered->file_offset, fs_info->sectorsize)); + ASSERT(IS_ALIGNED(ordered->num_bytes, fs_info->sectorsize)); - cb = alloc_compressed_bio(inode, start, REQ_OP_WRITE | write_flags, + cb = alloc_compressed_bio(inode, ordered->file_offset, + REQ_OP_WRITE | write_flags, end_compressed_bio_write); - cb->start = start; - cb->len = len; + cb->start = ordered->file_offset; + cb->len = ordered->num_bytes; cb->compressed_pages = compressed_pages; - cb->compressed_len = compressed_len; + cb->compressed_len = ordered->disk_num_bytes; cb->writeback = writeback; INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); cb->nr_pages = nr_pages; - cb->bbio.bio.bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; + cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT; + cb->bbio.ordered = ordered; btrfs_add_compressed_bio_pages(cb); btrfs_submit_bio(&cb->bbio, 0); @@ -421,7 +413,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, */ if (!em || cur < em->start || (cur + fs_info->sectorsize > extent_map_end(em)) || - (em->block_start >> 9) != orig_bio->bi_iter.bi_sector) { + (em->block_start >> SECTOR_SHIFT) != orig_bio->bi_iter.bi_sector) { free_extent_map(em); unlock_extent(tree, cur, page_end, NULL); unlock_page(page); @@ -472,7 +464,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, * After the compressed pages are read, we copy the bytes into the * bio we were passed and then call the bio end_io calls */ -void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num) +void btrfs_submit_compressed_read(struct btrfs_bio *bbio) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -538,7 +530,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num) if (memstall) psi_memstall_leave(&pflags); - btrfs_submit_bio(&cb->bbio, mirror_num); + btrfs_submit_bio(&cb->bbio, 0); return; out_free_compressed_pages: diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 19ab2abeddc0..03bb9d143fa7 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -10,6 +10,7 @@ #include "bio.h" struct btrfs_inode; +struct btrfs_ordered_extent; /* * We want to make sure that amount of RAM required to uncompress an extent is @@ -86,14 +87,12 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed); -void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, - unsigned int len, u64 disk_start, - unsigned int compressed_len, +void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, struct page **compressed_pages, unsigned int nr_pages, blk_opf_t write_flags, bool writeback); -void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num); +void btrfs_submit_compressed_read(struct btrfs_bio *bbio); unsigned int btrfs_compress_str2level(unsigned int type, const char *str); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 2ff2961b1183..a4cb4b642987 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -37,8 +37,6 @@ static int push_node_left(struct btrfs_trans_handle *trans, static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); -static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, - int level, int slot); static const struct btrfs_csums { u16 size; @@ -150,13 +148,19 @@ static inline void copy_leaf_items(const struct extent_buffer *dst, nr_items * sizeof(struct btrfs_item)); } +/* This exists for btrfs-progs usages. */ +u16 btrfs_csum_type_size(u16 type) +{ + return btrfs_csums[type].size; +} + int btrfs_super_csum_size(const struct btrfs_super_block *s) { u16 t = btrfs_super_csum_type(s); /* * csum type is validated at mount time */ - return btrfs_csums[t].size; + return btrfs_csum_type_size(t); } const char *btrfs_super_csum_name(u16 csum_type) @@ -417,9 +421,13 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, &refs, &flags); if (ret) return ret; - if (refs == 0) { - ret = -EROFS; - btrfs_handle_fs_error(fs_info, ret, NULL); + if (unlikely(refs == 0)) { + btrfs_crit(fs_info, + "found 0 references for tree block at bytenr %llu level %d root %llu", + buf->start, btrfs_header_level(buf), + btrfs_root_id(root)); + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); return ret; } } else { @@ -464,10 +472,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, return ret; } if (new_flags != 0) { - int level = btrfs_header_level(buf); - - ret = btrfs_set_disk_extent_flags(trans, buf, - new_flags, level); + ret = btrfs_set_disk_extent_flags(trans, buf, new_flags); if (ret) return ret; } @@ -583,9 +588,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) parent_start = buf->start; - atomic_inc(&cow->refs); ret = btrfs_tree_mod_log_insert_root(root->node, cow, true); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + btrfs_abort_transaction(trans, ret); + return ret; + } + atomic_inc(&cow->refs); rcu_assign_pointer(root->node, cow); btrfs_free_tree_block(trans, btrfs_root_id(root), buf, @@ -594,8 +604,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); } else { WARN_ON(trans->transid != btrfs_header_generation(parent)); - btrfs_tree_mod_log_insert_key(parent, parent_slot, - BTRFS_MOD_LOG_KEY_REPLACE); + ret = btrfs_tree_mod_log_insert_key(parent, parent_slot, + BTRFS_MOD_LOG_KEY_REPLACE); + if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + btrfs_abort_transaction(trans, ret); + return ret; + } btrfs_set_node_blockptr(parent, parent_slot, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, @@ -1028,8 +1044,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, child = btrfs_read_node_slot(mid, 0); if (IS_ERR(child)) { ret = PTR_ERR(child); - btrfs_handle_fs_error(fs_info, ret, NULL); - goto enospc; + goto out; } btrfs_tree_lock(child); @@ -1038,11 +1053,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (ret) { btrfs_tree_unlock(child); free_extent_buffer(child); - goto enospc; + goto out; } ret = btrfs_tree_mod_log_insert_root(root->node, child, true); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_tree_unlock(child); + free_extent_buffer(child); + btrfs_abort_transaction(trans, ret); + goto out; + } rcu_assign_pointer(root->node, child); add_root_to_dirty_list(root); @@ -1070,7 +1090,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (IS_ERR(left)) { ret = PTR_ERR(left); left = NULL; - goto enospc; + goto out; } __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); @@ -1079,7 +1099,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NESTING_LEFT_COW); if (wret) { ret = wret; - goto enospc; + goto out; } } @@ -1088,7 +1108,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (IS_ERR(right)) { ret = PTR_ERR(right); right = NULL; - goto enospc; + goto out; } __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); @@ -1097,7 +1117,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NESTING_RIGHT_COW); if (wret) { ret = wret; - goto enospc; + goto out; } } @@ -1119,7 +1139,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(right) == 0) { btrfs_clear_buffer_dirty(trans, right); btrfs_tree_unlock(right); - del_ptr(root, path, level + 1, pslot + 1); + ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1); + if (ret < 0) { + free_extent_buffer_stale(right); + right = NULL; + goto out; + } root_sub_used(root, right->len); btrfs_free_tree_block(trans, btrfs_root_id(root), right, 0, 1); @@ -1130,7 +1155,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_node_key(right, &right_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, BTRFS_MOD_LOG_KEY_REPLACE); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); } @@ -1145,15 +1173,19 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, * otherwise we would have pulled some pointers from the * right */ - if (!left) { - ret = -EROFS; - btrfs_handle_fs_error(fs_info, ret, NULL); - goto enospc; + if (unlikely(!left)) { + btrfs_crit(fs_info, +"missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu", + parent->start, btrfs_header_level(parent), + mid->start, btrfs_root_id(root)); + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + goto out; } wret = balance_node_right(trans, mid, left); if (wret < 0) { ret = wret; - goto enospc; + goto out; } if (wret == 1) { wret = push_node_left(trans, left, mid, 1); @@ -1165,7 +1197,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(mid) == 0) { btrfs_clear_buffer_dirty(trans, mid); btrfs_tree_unlock(mid); - del_ptr(root, path, level + 1, pslot); + ret = btrfs_del_ptr(trans, root, path, level + 1, pslot); + if (ret < 0) { + free_extent_buffer_stale(mid); + mid = NULL; + goto out; + } root_sub_used(root, mid->len); btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); free_extent_buffer_stale(mid); @@ -1176,7 +1213,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_node_key(mid, &mid_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, BTRFS_MOD_LOG_KEY_REPLACE); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); } @@ -1202,7 +1242,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (orig_ptr != btrfs_node_blockptr(path->nodes[level], path->slots[level])) BUG(); -enospc: +out: if (right) { btrfs_tree_unlock(right); free_extent_buffer(right); @@ -1278,7 +1318,12 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(mid, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, BTRFS_MOD_LOG_KEY_REPLACE); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_tree_unlock(left); + free_extent_buffer(left); + btrfs_abort_transaction(trans, ret); + return ret; + } btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); if (btrfs_header_nritems(left) > orig_slot) { @@ -1333,7 +1378,12 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(right, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, BTRFS_MOD_LOG_KEY_REPLACE); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + btrfs_abort_transaction(trans, ret); + return ret; + } btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -2379,6 +2429,87 @@ done: } /* + * Search the tree again to find a leaf with smaller keys. + * Returns 0 if it found something. + * Returns 1 if there are no smaller keys. + * Returns < 0 on error. + * + * This may release the path, and so you may lose any locks held at the + * time you call it. + */ +static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + struct btrfs_key key; + struct btrfs_key orig_key; + struct btrfs_disk_key found_key; + int ret; + + btrfs_item_key_to_cpu(path->nodes[0], &key, 0); + orig_key = key; + + if (key.offset > 0) { + key.offset--; + } else if (key.type > 0) { + key.type--; + key.offset = (u64)-1; + } else if (key.objectid > 0) { + key.objectid--; + key.type = (u8)-1; + key.offset = (u64)-1; + } else { + return 1; + } + + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret <= 0) + return ret; + + /* + * Previous key not found. Even if we were at slot 0 of the leaf we had + * before releasing the path and calling btrfs_search_slot(), we now may + * be in a slot pointing to the same original key - this can happen if + * after we released the path, one of more items were moved from a + * sibling leaf into the front of the leaf we had due to an insertion + * (see push_leaf_right()). + * If we hit this case and our slot is > 0 and just decrement the slot + * so that the caller does not process the same key again, which may or + * may not break the caller, depending on its logic. + */ + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); + ret = comp_keys(&found_key, &orig_key); + if (ret == 0) { + if (path->slots[0] > 0) { + path->slots[0]--; + return 0; + } + /* + * At slot 0, same key as before, it means orig_key is + * the lowest, leftmost, key in the tree. We're done. + */ + return 1; + } + } + + btrfs_item_key(path->nodes[0], &found_key, 0); + ret = comp_keys(&found_key, &key); + /* + * We might have had an item with the previous key in the tree right + * before we released our path. And after we released our path, that + * item might have been pushed to the first slot (0) of the leaf we + * were holding due to a tree balance. Alternatively, an item with the + * previous key can exist as the only element of a leaf (big fat item). + * Therefore account for these 2 cases, so that our callers (like + * btrfs_previous_item) don't miss an existing item with a key matching + * the previous key we computed above. + */ + if (ret <= 0) + return 0; + return 1; +} + +/* * helper to use instead of search slot if no exact match is needed but * instead the next or previous item should be returned. * When find_higher is true, the next higher item is returned, the next lower @@ -2552,6 +2683,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, if (slot > 0) { btrfs_item_key(eb, &disk_key, slot - 1); if (unlikely(comp_keys(&disk_key, new_key) >= 0)) { + btrfs_print_leaf(eb); btrfs_crit(fs_info, "slot %u key (%llu %u %llu) new key (%llu %u %llu)", slot, btrfs_disk_key_objectid(&disk_key), @@ -2559,13 +2691,13 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, btrfs_disk_key_offset(&disk_key), new_key->objectid, new_key->type, new_key->offset); - btrfs_print_leaf(eb); BUG(); } } if (slot < btrfs_header_nritems(eb) - 1) { btrfs_item_key(eb, &disk_key, slot + 1); if (unlikely(comp_keys(&disk_key, new_key) <= 0)) { + btrfs_print_leaf(eb); btrfs_crit(fs_info, "slot %u key (%llu %u %llu) new key (%llu %u %llu)", slot, btrfs_disk_key_objectid(&disk_key), @@ -2573,7 +2705,6 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, btrfs_disk_key_offset(&disk_key), new_key->objectid, new_key->type, new_key->offset); - btrfs_print_leaf(eb); BUG(); } } @@ -2626,7 +2757,7 @@ static bool check_sibling_keys(struct extent_buffer *left, btrfs_item_key_to_cpu(right, &right_first, 0); } - if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) { + if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) { btrfs_crit(left->fs_info, "left extent buffer:"); btrfs_print_tree(left, false); btrfs_crit(left->fs_info, "right extent buffer:"); @@ -2703,8 +2834,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, if (push_items < src_nritems) { /* - * Don't call btrfs_tree_mod_log_insert_move() here, key removal - * was already fully logged by btrfs_tree_mod_log_eb_copy() above. + * btrfs_tree_mod_log_eb_copy handles logging the move, so we + * don't need to do an explicit tree mod log operation for it. */ memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0), btrfs_node_key_ptr_offset(src, push_items), @@ -2765,8 +2896,11 @@ static int balance_node_right(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); return ret; } - ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); - BUG_ON(ret < 0); + + /* + * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't + * need to do an explicit tree mod log operation for it. + */ memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items), btrfs_node_key_ptr_offset(dst, 0), (dst_nritems) * @@ -2840,7 +2974,12 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, old = root->node; ret = btrfs_tree_mod_log_insert_root(root->node, c, false); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); + btrfs_tree_unlock(c); + free_extent_buffer(c); + return ret; + } rcu_assign_pointer(root->node, c); /* the super has an extra ref to root->node */ @@ -2861,10 +3000,10 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, * slot and level indicate where you want the key to go, and * blocknr is the block the key points to. */ -static void insert_ptr(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_disk_key *key, u64 bytenr, - int slot, int level) +static int insert_ptr(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_disk_key *key, u64 bytenr, + int slot, int level) { struct extent_buffer *lower; int nritems; @@ -2880,7 +3019,10 @@ static void insert_ptr(struct btrfs_trans_handle *trans, if (level) { ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, slot, nritems - slot); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + return ret; + } } memmove_extent_buffer(lower, btrfs_node_key_ptr_offset(lower, slot + 1), @@ -2890,7 +3032,10 @@ static void insert_ptr(struct btrfs_trans_handle *trans, if (level) { ret = btrfs_tree_mod_log_insert_key(lower, slot, BTRFS_MOD_LOG_KEY_ADD); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + return ret; + } } btrfs_set_node_key(lower, key, slot); btrfs_set_node_blockptr(lower, slot, bytenr); @@ -2898,6 +3043,8 @@ static void insert_ptr(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(lower, slot, trans->transid); btrfs_set_header_nritems(lower, nritems + 1); btrfs_mark_buffer_dirty(lower); + + return 0; } /* @@ -2962,6 +3109,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); if (ret) { + btrfs_tree_unlock(split); + free_extent_buffer(split); btrfs_abort_transaction(trans, ret); return ret; } @@ -2975,8 +3124,13 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); btrfs_mark_buffer_dirty(split); - insert_ptr(trans, path, &disk_key, split->start, - path->slots[level + 1] + 1, level + 1); + ret = insert_ptr(trans, path, &disk_key, split->start, + path->slots[level + 1] + 1, level + 1); + if (ret < 0) { + btrfs_tree_unlock(split); + free_extent_buffer(split); + return ret; + } if (path->slots[level] >= mid) { path->slots[level] -= mid; @@ -2996,7 +3150,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, * and nr indicate which items in the leaf to check. This totals up the * space used both by the item structs and the item data */ -static int leaf_space_used(struct extent_buffer *l, int start, int nr) +static int leaf_space_used(const struct extent_buffer *l, int start, int nr) { int data_len; int nritems = btrfs_header_nritems(l); @@ -3016,7 +3170,7 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr) * the start of the leaf data. IOW, how much room * the leaf has left for both items and data */ -noinline int btrfs_leaf_free_space(struct extent_buffer *leaf) +int btrfs_leaf_free_space(const struct extent_buffer *leaf) { struct btrfs_fs_info *fs_info = leaf->fs_info; int nritems = btrfs_header_nritems(leaf); @@ -3453,16 +3607,17 @@ out: * split the path's leaf in two, making sure there is at least data_size * available for the resulting leaf level of the path. */ -static noinline void copy_for_split(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct extent_buffer *l, - struct extent_buffer *right, - int slot, int mid, int nritems) +static noinline int copy_for_split(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct extent_buffer *l, + struct extent_buffer *right, + int slot, int mid, int nritems) { struct btrfs_fs_info *fs_info = trans->fs_info; int data_copy_size; int rt_data_off; int i; + int ret; struct btrfs_disk_key disk_key; struct btrfs_map_token token; @@ -3487,7 +3642,9 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(l, mid); btrfs_item_key(right, &disk_key, 0); - insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); + ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); + if (ret < 0) + return ret; btrfs_mark_buffer_dirty(right); btrfs_mark_buffer_dirty(l); @@ -3505,6 +3662,8 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, } BUG_ON(path->slots[0] < 0); + + return 0; } /* @@ -3703,8 +3862,13 @@ again: if (split == 0) { if (mid <= slot) { btrfs_set_header_nritems(right, 0); - insert_ptr(trans, path, &disk_key, - right->start, path->slots[1] + 1, 1); + ret = insert_ptr(trans, path, &disk_key, + right->start, path->slots[1] + 1, 1); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3712,8 +3876,13 @@ again: path->slots[1] += 1; } else { btrfs_set_header_nritems(right, 0); - insert_ptr(trans, path, &disk_key, - right->start, path->slots[1], 1); + ret = insert_ptr(trans, path, &disk_key, + right->start, path->slots[1], 1); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3729,7 +3898,12 @@ again: return ret; } - copy_for_split(trans, path, l, right, slot, mid, nritems); + ret = copy_for_split(trans, path, l, right, slot, mid, nritems); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } if (split == 2) { BUG_ON(num_doubles != 0); @@ -3826,7 +4000,12 @@ static noinline int split_item(struct btrfs_path *path, struct btrfs_disk_key disk_key; leaf = path->nodes[0]; - BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)); + /* + * Shouldn't happen because the caller must have previously called + * setup_leaf_for_split() to make room for the new item in the leaf. + */ + if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item))) + return -ENOSPC; orig_slot = path->slots[0]; orig_offset = btrfs_item_offset(leaf, path->slots[0]); @@ -4273,9 +4452,11 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, * * the tree should have been previously balanced so the deletion does not * empty a node. + * + * This is exported for use inside btrfs-progs, don't un-export it. */ -static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, - int level, int slot) +int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot) { struct extent_buffer *parent = path->nodes[level]; u32 nritems; @@ -4286,7 +4467,10 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, if (level) { ret = btrfs_tree_mod_log_insert_move(parent, slot, slot + 1, nritems - slot - 1); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + return ret; + } } memmove_extent_buffer(parent, btrfs_node_key_ptr_offset(parent, slot), @@ -4296,7 +4480,10 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, } else if (level) { ret = btrfs_tree_mod_log_insert_key(parent, slot, BTRFS_MOD_LOG_KEY_REMOVE); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + return ret; + } } nritems--; @@ -4312,6 +4499,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, fixup_low_keys(path, &disk_key, level + 1); } btrfs_mark_buffer_dirty(parent); + return 0; } /* @@ -4324,13 +4512,17 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, * The path must have already been setup for deleting the leaf, including * all the proper balancing. path->nodes[1] must be locked. */ -static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *leaf) +static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf) { + int ret; + WARN_ON(btrfs_header_generation(leaf) != trans->transid); - del_ptr(root, path, 1, path->slots[1]); + ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]); + if (ret < 0) + return ret; /* * btrfs_free_extent is expensive, we want to make sure we @@ -4343,6 +4535,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, atomic_inc(&leaf->refs); btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); free_extent_buffer_stale(leaf); + return 0; } /* * delete the item at the leaf level in path. If that empties @@ -4392,7 +4585,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_level(leaf, 0); } else { btrfs_clear_buffer_dirty(trans, leaf); - btrfs_del_leaf(trans, root, path, leaf); + ret = btrfs_del_leaf(trans, root, path, leaf); + if (ret < 0) + return ret; } } else { int used = leaf_space_used(leaf, 0, nritems); @@ -4416,7 +4611,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, /* push_leaf_left fixes the path. * make sure the path still points to our leaf - * for possible call to del_ptr below + * for possible call to btrfs_del_ptr below */ slot = path->slots[1]; atomic_inc(&leaf->refs); @@ -4453,7 +4648,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (btrfs_header_nritems(leaf) == 0) { path->slots[1] = slot; - btrfs_del_leaf(trans, root, path, leaf); + ret = btrfs_del_leaf(trans, root, path, leaf); + if (ret < 0) + return ret; free_extent_buffer(leaf); ret = 0; } else { @@ -4474,86 +4671,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, } /* - * search the tree again to find a leaf with lesser keys - * returns 0 if it found something or 1 if there are no lesser leaves. - * returns < 0 on io errors. - * - * This may release the path, and so you may lose any locks held at the - * time you call it. - */ -int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) -{ - struct btrfs_key key; - struct btrfs_key orig_key; - struct btrfs_disk_key found_key; - int ret; - - btrfs_item_key_to_cpu(path->nodes[0], &key, 0); - orig_key = key; - - if (key.offset > 0) { - key.offset--; - } else if (key.type > 0) { - key.type--; - key.offset = (u64)-1; - } else if (key.objectid > 0) { - key.objectid--; - key.type = (u8)-1; - key.offset = (u64)-1; - } else { - return 1; - } - - btrfs_release_path(path); - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret <= 0) - return ret; - - /* - * Previous key not found. Even if we were at slot 0 of the leaf we had - * before releasing the path and calling btrfs_search_slot(), we now may - * be in a slot pointing to the same original key - this can happen if - * after we released the path, one of more items were moved from a - * sibling leaf into the front of the leaf we had due to an insertion - * (see push_leaf_right()). - * If we hit this case and our slot is > 0 and just decrement the slot - * so that the caller does not process the same key again, which may or - * may not break the caller, depending on its logic. - */ - if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { - btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); - ret = comp_keys(&found_key, &orig_key); - if (ret == 0) { - if (path->slots[0] > 0) { - path->slots[0]--; - return 0; - } - /* - * At slot 0, same key as before, it means orig_key is - * the lowest, leftmost, key in the tree. We're done. - */ - return 1; - } - } - - btrfs_item_key(path->nodes[0], &found_key, 0); - ret = comp_keys(&found_key, &key); - /* - * We might have had an item with the previous key in the tree right - * before we released our path. And after we released our path, that - * item might have been pushed to the first slot (0) of the leaf we - * were holding due to a tree balance. Alternatively, an item with the - * previous key can exist as the only element of a leaf (big fat item). - * Therefore account for these 2 cases, so that our callers (like - * btrfs_previous_item) don't miss an existing item with a key matching - * the previous key we computed above. - */ - if (ret <= 0) - return 0; - return 1; -} - -/* * A helper function to walk down the tree starting at min_key, and looking * for nodes or leaves that are have a minimum transaction id. * This is used by the btree defrag code, and tree logging diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4c1986cd5bed..f2d2b313bde5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -541,6 +541,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 new_root_objectid); int btrfs_block_can_be_shared(struct btrfs_root *root, struct extent_buffer *buf); +int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot); void btrfs_extend_item(struct btrfs_path *path, u32 data_size); void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, @@ -633,7 +635,6 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, return btrfs_insert_empty_items(trans, root, path, &batch); } -int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); @@ -686,7 +687,7 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) { return btrfs_next_old_item(root, p, 0); } -int btrfs_leaf_free_space(struct extent_buffer *leaf); +int btrfs_leaf_free_space(const struct extent_buffer *leaf); static inline int is_fstree(u64 rootid) { @@ -702,6 +703,7 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; } +u16 btrfs_csum_type_size(u16 type); int btrfs_super_csum_size(const struct btrfs_super_block *s); const char *btrfs_super_csum_name(u16 csum_type); const char *btrfs_super_csum_driver(u16 csum_type); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 8065341d831a..f2ff4cbe8656 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -1040,7 +1040,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, clear_extent_bit(&inode->io_tree, start, start + len - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, cached_state); - set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state); + set_extent_bit(&inode->io_tree, start, start + len - 1, + EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state); /* Update the page status */ for (i = start_index - first_index; i <= last_index - first_index; i++) { diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 0b32432d7d56..6a13cf00218b 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -407,7 +407,6 @@ static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); - ref->in_tree = 0; btrfs_put_delayed_ref(ref); atomic_dec(&delayed_refs->num_entries); } @@ -507,6 +506,7 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head( { struct btrfs_delayed_ref_head *head; + lockdep_assert_held(&delayed_refs->lock); again: head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, true); @@ -531,7 +531,7 @@ again: href_node); } - head->processing = 1; + head->processing = true; WARN_ON(delayed_refs->num_heads_ready == 0); delayed_refs->num_heads_ready--; delayed_refs->run_delayed_start = head->bytenr + @@ -549,31 +549,35 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, RB_CLEAR_NODE(&head->href_node); atomic_dec(&delayed_refs->num_entries); delayed_refs->num_heads--; - if (head->processing == 0) + if (!head->processing) delayed_refs->num_heads_ready--; } /* * Helper to insert the ref_node to the tail or merge with tail. * - * Return 0 for insert. - * Return >0 for merge. + * Return false if the ref was inserted. + * Return true if the ref was merged into an existing one (and therefore can be + * freed by the caller). */ -static int insert_delayed_ref(struct btrfs_delayed_ref_root *root, - struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *ref) +static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root, + struct btrfs_delayed_ref_head *href, + struct btrfs_delayed_ref_node *ref) { struct btrfs_delayed_ref_node *exist; int mod; - int ret = 0; spin_lock(&href->lock); exist = tree_insert(&href->ref_tree, ref); - if (!exist) - goto inserted; + if (!exist) { + if (ref->action == BTRFS_ADD_DELAYED_REF) + list_add_tail(&ref->add_list, &href->ref_add_list); + atomic_inc(&root->num_entries); + spin_unlock(&href->lock); + return false; + } /* Now we are sure we can merge */ - ret = 1; if (exist->action == ref->action) { mod = ref->ref_mod; } else { @@ -600,13 +604,7 @@ static int insert_delayed_ref(struct btrfs_delayed_ref_root *root, if (exist->ref_mod == 0) drop_delayed_ref(root, href, exist); spin_unlock(&href->lock); - return ret; -inserted: - if (ref->action == BTRFS_ADD_DELAYED_REF) - list_add_tail(&ref->add_list, &href->ref_add_list); - atomic_inc(&root->num_entries); - spin_unlock(&href->lock); - return ret; + return true; } /* @@ -699,34 +697,38 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, bool is_system) { int count_mod = 1; - int must_insert_reserved = 0; + bool must_insert_reserved = false; /* If reserved is provided, it must be a data extent. */ BUG_ON(!is_data && reserved); - /* - * The head node stores the sum of all the mods, so dropping a ref - * should drop the sum in the head node by one. - */ - if (action == BTRFS_UPDATE_DELAYED_HEAD) + switch (action) { + case BTRFS_UPDATE_DELAYED_HEAD: count_mod = 0; - else if (action == BTRFS_DROP_DELAYED_REF) + break; + case BTRFS_DROP_DELAYED_REF: + /* + * The head node stores the sum of all the mods, so dropping a ref + * should drop the sum in the head node by one. + */ count_mod = -1; - - /* - * BTRFS_ADD_DELAYED_EXTENT means that we need to update the reserved - * accounting when the extent is finally added, or if a later - * modification deletes the delayed ref without ever inserting the - * extent into the extent allocation tree. ref->must_insert_reserved - * is the flag used to record that accounting mods are required. - * - * Once we record must_insert_reserved, switch the action to - * BTRFS_ADD_DELAYED_REF because other special casing is not required. - */ - if (action == BTRFS_ADD_DELAYED_EXTENT) - must_insert_reserved = 1; - else - must_insert_reserved = 0; + break; + case BTRFS_ADD_DELAYED_EXTENT: + /* + * BTRFS_ADD_DELAYED_EXTENT means that we need to update the + * reserved accounting when the extent is finally added, or if a + * later modification deletes the delayed ref without ever + * inserting the extent into the extent allocation tree. + * ref->must_insert_reserved is the flag used to record that + * accounting mods are required. + * + * Once we record must_insert_reserved, switch the action to + * BTRFS_ADD_DELAYED_REF because other special casing is not + * required. + */ + must_insert_reserved = true; + break; + } refcount_set(&head_ref->refs, 1); head_ref->bytenr = bytenr; @@ -738,7 +740,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, head_ref->ref_tree = RB_ROOT_CACHED; INIT_LIST_HEAD(&head_ref->ref_add_list); RB_CLEAR_NODE(&head_ref->href_node); - head_ref->processing = 0; + head_ref->processing = false; head_ref->total_ref_mod = count_mod; spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); @@ -763,11 +765,11 @@ static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, - int action, int *qrecord_inserted_ret) + int action, bool *qrecord_inserted_ret) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; - int qrecord_inserted = 0; + bool qrecord_inserted = false; delayed_refs = &trans->transaction->delayed_refs; @@ -777,7 +779,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, delayed_refs, qrecord)) kfree(qrecord); else - qrecord_inserted = 1; + qrecord_inserted = true; } trace_add_delayed_ref_head(trans->fs_info, head_ref, action); @@ -853,8 +855,6 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, ref->num_bytes = num_bytes; ref->ref_mod = 1; ref->action = action; - ref->is_head = 0; - ref->in_tree = 1; ref->seq = seq; ref->type = ref_type; RB_CLEAR_NODE(&ref->ref_node); @@ -875,11 +875,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; - int qrecord_inserted; + bool qrecord_inserted; bool is_system; + bool merged; int action = generic_ref->action; int level = generic_ref->tree_ref.level; - int ret; u64 bytenr = generic_ref->bytenr; u64 num_bytes = generic_ref->len; u64 parent = generic_ref->parent; @@ -935,7 +935,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); + merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* @@ -947,7 +947,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, trace_add_delayed_tree_ref(fs_info, &ref->node, ref, action == BTRFS_ADD_DELAYED_EXTENT ? BTRFS_ADD_DELAYED_REF : action); - if (ret > 0) + if (merged) kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); if (qrecord_inserted) @@ -968,9 +968,9 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; - int qrecord_inserted; + bool qrecord_inserted; int action = generic_ref->action; - int ret; + bool merged; u64 bytenr = generic_ref->bytenr; u64 num_bytes = generic_ref->len; u64 parent = generic_ref->parent; @@ -1027,7 +1027,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); + merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* @@ -1039,7 +1039,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref, action == BTRFS_ADD_DELAYED_EXTENT ? BTRFS_ADD_DELAYED_REF : action); - if (ret > 0) + if (merged) kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index b54261fe509b..b8e14b0ba5f1 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -48,9 +48,6 @@ struct btrfs_delayed_ref_node { unsigned int action:8; unsigned int type:8; - /* is this node still in the rbtree? */ - unsigned int is_head:1; - unsigned int in_tree:1; }; struct btrfs_delayed_extent_op { @@ -70,20 +67,26 @@ struct btrfs_delayed_extent_op { struct btrfs_delayed_ref_head { u64 bytenr; u64 num_bytes; - refcount_t refs; + /* + * For insertion into struct btrfs_delayed_ref_root::href_root. + * Keep it in the same cache line as 'bytenr' for more efficient + * searches in the rbtree. + */ + struct rb_node href_node; /* * the mutex is held while running the refs, and it is also * held when checking the sum of reference modifications. */ struct mutex mutex; + refcount_t refs; + + /* Protects 'ref_tree' and 'ref_add_list'. */ spinlock_t lock; struct rb_root_cached ref_tree; /* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */ struct list_head ref_add_list; - struct rb_node href_node; - struct btrfs_delayed_extent_op *extent_op; /* @@ -113,10 +116,10 @@ struct btrfs_delayed_ref_head { * we need to update the in ram accounting to properly reflect * the free has happened. */ - unsigned int must_insert_reserved:1; - unsigned int is_data:1; - unsigned int is_system:1; - unsigned int processing:1; + bool must_insert_reserved; + bool is_data; + bool is_system; + bool processing; }; struct btrfs_delayed_tree_ref { @@ -337,7 +340,7 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) { WARN_ON(refcount_read(&ref->refs) == 0); if (refcount_dec_and_test(&ref->refs)) { - WARN_ON(ref->in_tree); + WARN_ON(!RB_EMPTY_NODE(&ref->ref_node)); switch (ref->type) { case BTRFS_TREE_BLOCK_REF_KEY: case BTRFS_SHARED_BLOCK_REF_KEY: diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 78696d331639..5e86bea0a950 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -41,7 +41,7 @@ * All new writes will be written to both target and source devices, so even * if replace gets canceled, sources device still contains up-to-date data. * - * Location: handle_ops_on_dev_replace() from __btrfs_map_block() + * Location: handle_ops_on_dev_replace() from btrfs_map_block() * Start: btrfs_dev_replace_start() * End: btrfs_dev_replace_finishing() * Content: Latest data/metadata @@ -795,8 +795,8 @@ static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, while (!find_first_extent_bit(&srcdev->alloc_state, start, &found_start, &found_end, CHUNK_ALLOCATED, &cached_state)) { - ret = set_extent_bits(&tgtdev->alloc_state, found_start, - found_end, CHUNK_ALLOCATED); + ret = set_extent_bit(&tgtdev->alloc_state, found_start, + found_end, CHUNK_ALLOCATED, NULL); if (ret) break; start = found_end + 1; diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index a6d77fe41e1a..944a7340f6a4 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -73,6 +73,23 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, return &discard_ctl->discard_list[block_group->discard_index]; } +/* + * Determine if async discard should be running. + * + * @discard_ctl: discard control + * + * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. + */ +static bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) +{ + struct btrfs_fs_info *fs_info = container_of(discard_ctl, + struct btrfs_fs_info, + discard_ctl); + + return (!(fs_info->sb->s_flags & SB_RDONLY) && + test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags)); +} + static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { @@ -545,23 +562,6 @@ static void btrfs_discard_workfn(struct work_struct *work) } /* - * Determine if async discard should be running. - * - * @discard_ctl: discard control - * - * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. - */ -bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) -{ - struct btrfs_fs_info *fs_info = container_of(discard_ctl, - struct btrfs_fs_info, - discard_ctl); - - return (!(fs_info->sb->s_flags & SB_RDONLY) && - test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags)); -} - -/* * Recalculate the base delay. * * @discard_ctl: discard control diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h index 57b9202f427f..dddb0f9101ba 100644 --- a/fs/btrfs/discard.h +++ b/fs/btrfs/discard.h @@ -24,7 +24,6 @@ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group); void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, bool override); -bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl); /* Update operations */ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dabc79c1af1b..7513388b0567 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -60,15 +60,6 @@ BTRFS_SUPER_FLAG_METADUMP |\ BTRFS_SUPER_FLAG_METADUMP_V2) -static void btrfs_destroy_ordered_extents(struct btrfs_root *root); -static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info); -static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); -static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, - struct extent_io_tree *dirty_pages, - int mark); -static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, - struct extent_io_tree *pinned_extents); static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info); static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info); @@ -110,35 +101,27 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) * detect blocks that either didn't get written at all or got written * in the wrong place. */ -static int verify_parent_transid(struct extent_io_tree *io_tree, - struct extent_buffer *eb, u64 parent_transid, - int atomic) +int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atomic) { - struct extent_state *cached_state = NULL; - int ret; + if (!extent_buffer_uptodate(eb)) + return 0; if (!parent_transid || btrfs_header_generation(eb) == parent_transid) - return 0; + return 1; if (atomic) return -EAGAIN; - lock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); - if (extent_buffer_uptodate(eb) && - btrfs_header_generation(eb) == parent_transid) { - ret = 0; - goto out; - } - btrfs_err_rl(eb->fs_info, + if (!extent_buffer_uptodate(eb) || + btrfs_header_generation(eb) != parent_transid) { + btrfs_err_rl(eb->fs_info, "parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", eb->start, eb->read_mirror, parent_transid, btrfs_header_generation(eb)); - ret = 1; - clear_extent_buffer_uptodate(eb); -out: - unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state); - return ret; + clear_extent_buffer_uptodate(eb); + return 0; + } + return 1; } static bool btrfs_supported_super_csum(u16 csum_type) @@ -180,64 +163,6 @@ int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_verify_level_key(struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - int found_level; - struct btrfs_key found_key; - int ret; - - found_level = btrfs_header_level(eb); - if (found_level != level) { - WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), - KERN_ERR "BTRFS: tree level check failed\n"); - btrfs_err(fs_info, -"tree level mismatch detected, bytenr=%llu level expected=%u has=%u", - eb->start, level, found_level); - return -EIO; - } - - if (!first_key) - return 0; - - /* - * For live tree block (new tree blocks in current transaction), - * we need proper lock context to avoid race, which is impossible here. - * So we only checks tree blocks which is read from disk, whose - * generation <= fs_info->last_trans_committed. - */ - if (btrfs_header_generation(eb) > fs_info->last_trans_committed) - return 0; - - /* We have @first_key, so this @eb must have at least one item */ - if (btrfs_header_nritems(eb) == 0) { - btrfs_err(fs_info, - "invalid tree nritems, bytenr=%llu nritems=0 expect >0", - eb->start); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); - return -EUCLEAN; - } - - if (found_level) - btrfs_node_key_to_cpu(eb, &found_key, 0); - else - btrfs_item_key_to_cpu(eb, &found_key, 0); - ret = btrfs_comp_cpu_keys(first_key, &found_key); - - if (ret) { - WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), - KERN_ERR "BTRFS: tree first key check failed\n"); - btrfs_err(fs_info, -"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", - eb->start, parent_transid, first_key->objectid, - first_key->type, first_key->offset, - found_key.objectid, found_key.type, - found_key.offset); - } - return ret; -} - static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { @@ -312,12 +237,34 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, return ret; } -static int csum_one_extent_buffer(struct extent_buffer *eb) +/* + * Checksum a dirty tree block before IO. + */ +blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) { + struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; + u64 found_start = btrfs_header_bytenr(eb); u8 result[BTRFS_CSUM_SIZE]; int ret; + /* Btree blocks are always contiguous on disk. */ + if (WARN_ON_ONCE(bbio->file_offset != eb->start)) + return BLK_STS_IOERR; + if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len)) + return BLK_STS_IOERR; + + if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) { + WARN_ON_ONCE(found_start != 0); + return BLK_STS_OK; + } + + if (WARN_ON_ONCE(found_start != eb->start)) + return BLK_STS_IOERR; + if (WARN_ON(!btrfs_page_test_uptodate(fs_info, eb->pages[0], eb->start, + eb->len))) + return BLK_STS_IOERR; + ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, offsetof(struct btrfs_header, fsid), BTRFS_FSID_SIZE) == 0); @@ -326,7 +273,7 @@ static int csum_one_extent_buffer(struct extent_buffer *eb) if (btrfs_header_level(eb)) ret = btrfs_check_node(eb); else - ret = btrfs_check_leaf_full(eb); + ret = btrfs_check_leaf(eb); if (ret < 0) goto error; @@ -344,8 +291,7 @@ static int csum_one_extent_buffer(struct extent_buffer *eb) goto error; } write_extent_buffer(eb, result, 0, fs_info->csum_size); - - return 0; + return BLK_STS_OK; error: btrfs_print_tree(eb, 0); @@ -359,103 +305,10 @@ error: */ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) || btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID); - return ret; -} - -/* Checksum all dirty extent buffers in one bio_vec */ -static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info, - struct bio_vec *bvec) -{ - struct page *page = bvec->bv_page; - u64 bvec_start = page_offset(page) + bvec->bv_offset; - u64 cur; - int ret = 0; - - for (cur = bvec_start; cur < bvec_start + bvec->bv_len; - cur += fs_info->nodesize) { - struct extent_buffer *eb; - bool uptodate; - - eb = find_extent_buffer(fs_info, cur); - uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur, - fs_info->nodesize); - - /* A dirty eb shouldn't disappear from buffer_radix */ - if (WARN_ON(!eb)) - return -EUCLEAN; - - if (WARN_ON(cur != btrfs_header_bytenr(eb))) { - free_extent_buffer(eb); - return -EUCLEAN; - } - if (WARN_ON(!uptodate)) { - free_extent_buffer(eb); - return -EUCLEAN; - } - - ret = csum_one_extent_buffer(eb); - free_extent_buffer(eb); - if (ret < 0) - return ret; - } - return ret; -} - -/* - * Checksum a dirty tree block before IO. This has extra checks to make sure - * we only fill in the checksum field in the first page of a multi-page block. - * For subpage extent buffers we need bvec to also read the offset in the page. - */ -static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec) -{ - struct page *page = bvec->bv_page; - u64 start = page_offset(page); - u64 found_start; - struct extent_buffer *eb; - - if (fs_info->nodesize < PAGE_SIZE) - return csum_dirty_subpage_buffers(fs_info, bvec); - - eb = (struct extent_buffer *)page->private; - if (page != eb->pages[0]) - return 0; - - found_start = btrfs_header_bytenr(eb); - - if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) { - WARN_ON(found_start != 0); - return 0; - } - - /* - * Please do not consolidate these warnings into a single if. - * It is useful to know what went wrong. - */ - if (WARN_ON(found_start != start)) - return -EUCLEAN; - if (WARN_ON(!PageUptodate(page))) - return -EUCLEAN; - - return csum_one_extent_buffer(eb); -} - -blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) -{ - struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; - struct bvec_iter iter; - struct bio_vec bv; - int ret = 0; - - bio_for_each_segment(bv, &bbio->bio, iter) { - ret = csum_dirty_buffer(fs_info, &bv); - if (ret) - break; - } - return errno_to_blk_status(ret); } -static int check_tree_block_fsid(struct extent_buffer *eb) +static bool check_tree_block_fsid(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; @@ -475,18 +328,18 @@ static int check_tree_block_fsid(struct extent_buffer *eb) metadata_uuid = fs_devices->fsid; if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) - return 0; + return false; list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE)) - return 0; + return false; - return 1; + return true; } /* Do basic extent buffer checks at read time */ -static int validate_extent_buffer(struct extent_buffer *eb, - struct btrfs_tree_parent_check *check) +int btrfs_validate_extent_buffer(struct extent_buffer *eb, + struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start; @@ -583,7 +436,7 @@ static int validate_extent_buffer(struct extent_buffer *eb, * that we don't try and read the other copies of this block, just * return -EIO. */ - if (found_level == 0 && btrfs_check_leaf_full(eb)) { + if (found_level == 0 && btrfs_check_leaf(eb)) { set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = -EIO; } @@ -591,9 +444,7 @@ static int validate_extent_buffer(struct extent_buffer *eb, if (found_level > 0 && btrfs_check_node(eb)) ret = -EIO; - if (!ret) - set_extent_buffer_uptodate(eb); - else + if (ret) btrfs_err(fs_info, "read time tree block corruption detected on logical %llu mirror %u", eb->start, eb->read_mirror); @@ -601,105 +452,6 @@ out: return ret; } -static int validate_subpage_buffer(struct page *page, u64 start, u64 end, - int mirror, struct btrfs_tree_parent_check *check) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); - struct extent_buffer *eb; - bool reads_done; - int ret = 0; - - ASSERT(check); - - /* - * We don't allow bio merge for subpage metadata read, so we should - * only get one eb for each endio hook. - */ - ASSERT(end == start + fs_info->nodesize - 1); - ASSERT(PagePrivate(page)); - - eb = find_extent_buffer(fs_info, start); - /* - * When we are reading one tree block, eb must have been inserted into - * the radix tree. If not, something is wrong. - */ - ASSERT(eb); - - reads_done = atomic_dec_and_test(&eb->io_pages); - /* Subpage read must finish in page read */ - ASSERT(reads_done); - - eb->read_mirror = mirror; - if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { - ret = -EIO; - goto err; - } - ret = validate_extent_buffer(eb, check); - if (ret < 0) - goto err; - - set_extent_buffer_uptodate(eb); - - free_extent_buffer(eb); - return ret; -err: - /* - * end_bio_extent_readpage decrements io_pages in case of error, - * make sure it has something to decrement. - */ - atomic_inc(&eb->io_pages); - clear_extent_buffer_uptodate(eb); - free_extent_buffer(eb); - return ret; -} - -int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, - struct page *page, u64 start, u64 end, - int mirror) -{ - struct extent_buffer *eb; - int ret = 0; - int reads_done; - - ASSERT(page->private); - - if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return validate_subpage_buffer(page, start, end, mirror, - &bbio->parent_check); - - eb = (struct extent_buffer *)page->private; - - /* - * The pending IO might have been the only thing that kept this buffer - * in memory. Make sure we have a ref for all this other checks - */ - atomic_inc(&eb->refs); - - reads_done = atomic_dec_and_test(&eb->io_pages); - if (!reads_done) - goto err; - - eb->read_mirror = mirror; - if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { - ret = -EIO; - goto err; - } - ret = validate_extent_buffer(eb, &bbio->parent_check); -err: - if (ret) { - /* - * our io error hook is going to dec the io pages - * again, we have to make sure it has something - * to decrement - */ - atomic_inc(&eb->io_pages); - clear_extent_buffer_uptodate(eb); - } - free_extent_buffer(eb); - - return ret; -} - #ifdef CONFIG_MIGRATION static int btree_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) @@ -1396,8 +1148,7 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->fs_roots_radix_lock); root = radix_tree_lookup(&fs_info->fs_roots_radix, (unsigned long)root_id); - if (root) - root = btrfs_grab_root(root); + root = btrfs_grab_root(root); spin_unlock(&fs_info->fs_roots_radix_lock); return root; } @@ -1411,31 +1162,28 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, .offset = 0, }; - if (objectid == BTRFS_ROOT_TREE_OBJECTID) + switch (objectid) { + case BTRFS_ROOT_TREE_OBJECTID: return btrfs_grab_root(fs_info->tree_root); - if (objectid == BTRFS_EXTENT_TREE_OBJECTID) + case BTRFS_EXTENT_TREE_OBJECTID: return btrfs_grab_root(btrfs_global_root(fs_info, &key)); - if (objectid == BTRFS_CHUNK_TREE_OBJECTID) + case BTRFS_CHUNK_TREE_OBJECTID: return btrfs_grab_root(fs_info->chunk_root); - if (objectid == BTRFS_DEV_TREE_OBJECTID) + case BTRFS_DEV_TREE_OBJECTID: return btrfs_grab_root(fs_info->dev_root); - if (objectid == BTRFS_CSUM_TREE_OBJECTID) + case BTRFS_CSUM_TREE_OBJECTID: + return btrfs_grab_root(btrfs_global_root(fs_info, &key)); + case BTRFS_QUOTA_TREE_OBJECTID: + return btrfs_grab_root(fs_info->quota_root); + case BTRFS_UUID_TREE_OBJECTID: + return btrfs_grab_root(fs_info->uuid_root); + case BTRFS_BLOCK_GROUP_TREE_OBJECTID: + return btrfs_grab_root(fs_info->block_group_root); + case BTRFS_FREE_SPACE_TREE_OBJECTID: return btrfs_grab_root(btrfs_global_root(fs_info, &key)); - if (objectid == BTRFS_QUOTA_TREE_OBJECTID) - return btrfs_grab_root(fs_info->quota_root) ? - fs_info->quota_root : ERR_PTR(-ENOENT); - if (objectid == BTRFS_UUID_TREE_OBJECTID) - return btrfs_grab_root(fs_info->uuid_root) ? - fs_info->uuid_root : ERR_PTR(-ENOENT); - if (objectid == BTRFS_BLOCK_GROUP_TREE_OBJECTID) - return btrfs_grab_root(fs_info->block_group_root) ? - fs_info->block_group_root : ERR_PTR(-ENOENT); - if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) { - struct btrfs_root *root = btrfs_global_root(fs_info, &key); - - return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT); - } - return NULL; + default: + return NULL; + } } int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, @@ -1991,7 +1739,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) { btrfs_destroy_workqueue(fs_info->fixup_workers); btrfs_destroy_workqueue(fs_info->delalloc_workers); - btrfs_destroy_workqueue(fs_info->hipri_workers); btrfs_destroy_workqueue(fs_info->workers); if (fs_info->endio_workers) destroy_workqueue(fs_info->endio_workers); @@ -2183,12 +1930,10 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) { u32 max_active = fs_info->thread_pool_size; unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; + unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE; fs_info->workers = btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16); - fs_info->hipri_workers = - btrfs_alloc_workqueue(fs_info, "worker-high", - flags | WQ_HIGHPRI, max_active, 16); fs_info->delalloc_workers = btrfs_alloc_workqueue(fs_info, "delalloc", @@ -2202,7 +1947,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0); fs_info->fixup_workers = - btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0); + btrfs_alloc_ordered_workqueue(fs_info, "fixup", ordered_flags); fs_info->endio_workers = alloc_workqueue("btrfs-endio", flags, max_active); @@ -2221,11 +1966,12 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) btrfs_alloc_workqueue(fs_info, "delayed-meta", flags, max_active, 0); fs_info->qgroup_rescan_workers = - btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0); + btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan", + ordered_flags); fs_info->discard_ctl.discard_workers = - alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1); + alloc_ordered_workqueue("btrfs_discard", WQ_FREEZABLE); - if (!(fs_info->workers && fs_info->hipri_workers && + if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->compressed_write_workers && @@ -2265,6 +2011,9 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) if (!strstr(crypto_shash_driver_name(csum_shash), "generic")) set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); break; + case BTRFS_CSUM_TYPE_XXHASH: + set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); + break; default: break; } @@ -2642,6 +2391,14 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } + if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, + BTRFS_FSID_SIZE) != 0) { + btrfs_err(fs_info, + "dev_item UUID does not match metadata fsid: %pU != %pU", + fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid); + ret = -EINVAL; + } + /* * Artificial requirement for block-group-tree to force newer features * (free-space-tree, no-holes) so the test matrix is smaller. @@ -2654,14 +2411,6 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } - if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, - BTRFS_FSID_SIZE) != 0) { - btrfs_err(fs_info, - "dev_item UUID does not match metadata fsid: %pU != %pU", - fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid); - ret = -EINVAL; - } - /* * Hint to catch really bogus numbers, bitflips or so, more exact checks are * done later @@ -4662,28 +4411,10 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_close_devices(fs_info->fs_devices); } -int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, - int atomic) -{ - int ret; - struct inode *btree_inode = buf->pages[0]->mapping->host; - - ret = extent_buffer_uptodate(buf); - if (!ret) - return ret; - - ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, - parent_transid, atomic); - if (ret == -EAGAIN) - return ret; - return !ret; -} - void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { struct btrfs_fs_info *fs_info = buf->fs_info; u64 transid = btrfs_header_generation(buf); - int was_dirty; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* @@ -4698,19 +4429,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) if (transid != fs_info->generation) WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n", buf->start, transid, fs_info->generation); - was_dirty = set_extent_buffer_dirty(buf); - if (!was_dirty) - percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, - buf->len, - fs_info->dirty_metadata_batch); + set_extent_buffer_dirty(buf); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY /* - * Since btrfs_mark_buffer_dirty() can be called with item pointer set - * but item data not updated. - * So here we should only check item pointers, not item data. + * btrfs_check_leaf() won't check item data if we don't have WRITTEN + * set, so this will only validate the basic structure of the items. */ - if (btrfs_header_level(buf) == 0 && - btrfs_check_leaf_relaxed(buf)) { + if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(buf)) { btrfs_print_leaf(buf); ASSERT(0); } @@ -4840,13 +4565,12 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } -static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info) +static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_fs_info *fs_info) { struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; - int ret = 0; delayed_refs = &trans->delayed_refs; @@ -4854,7 +4578,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, if (atomic_read(&delayed_refs->num_entries) == 0) { spin_unlock(&delayed_refs->lock); btrfs_debug(fs_info, "delayed_refs has NO entry"); - return ret; + return; } while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { @@ -4871,7 +4595,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, while ((n = rb_first_cached(&head->ref_tree)) != NULL) { ref = rb_entry(n, struct btrfs_delayed_ref_node, ref_node); - ref->in_tree = 0; rb_erase_cached(&ref->ref_node, &head->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) @@ -4916,8 +4639,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, btrfs_qgroup_destroy_extent_records(trans); spin_unlock(&delayed_refs->lock); - - return ret; } static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) @@ -5142,8 +4863,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, EXTENT_DIRTY); btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); - btrfs_free_redirty_list(cur_trans); - cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 4d5772330110..b03767f4d7ed 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -31,8 +31,6 @@ struct btrfs_tree_parent_check; void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info); void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); -int btrfs_verify_level_key(struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, struct btrfs_tree_parent_check *check); struct extent_buffer *btrfs_find_create_tree_block( @@ -84,9 +82,8 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); -int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, - struct page *page, u64 start, u64 end, - int mirror); +int btrfs_validate_extent_buffer(struct extent_buffer *eb, + struct btrfs_tree_parent_check *check); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 29a225836e28..a2315a4b8b75 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -533,6 +533,16 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, } /* + * Detect if extent bits request NOWAIT semantics and set the gfp mask accordingly, + * unset the EXTENT_NOWAIT bit. + */ +static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask) +{ + *mask = (*bits & EXTENT_NOWAIT ? GFP_NOWAIT : GFP_NOFS); + *bits &= EXTENT_NOWAIT - 1; +} + +/* * Clear some bits on a range in the tree. This may require splitting or * inserting elements in the tree, so the gfp mask is used to indicate which * allocations or sleeping are allowed. @@ -546,7 +556,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, */ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_state **cached_state, - gfp_t mask, struct extent_changeset *changeset) + struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *cached; @@ -556,7 +566,9 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int clear = 0; int wake; int delete = (bits & EXTENT_CLEAR_ALL_BITS); + gfp_t mask; + set_gfp_mask_from_bits(&bits, &mask); btrfs_debug_check_extent_io_range(tree, start, end); trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits); @@ -953,7 +965,8 @@ out: /* * Set some bits on a range in the tree. This may require allocations or - * sleeping, so the gfp mask is used to indicate what is allowed. + * sleeping. By default all allocations use GFP_NOFS, use EXTENT_NOWAIT for + * GFP_NOWAIT. * * If any of the exclusive bits are set, this will fail with -EEXIST if some * part of the range already has the desired bits set. The extent_state of the @@ -968,7 +981,7 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, u64 *failed_start, struct extent_state **failed_state, struct extent_state **cached_state, - struct extent_changeset *changeset, gfp_t mask) + struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -978,7 +991,9 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u64 last_start; u64 last_end; u32 exclusive_bits = (bits & EXTENT_LOCKED); + gfp_t mask; + set_gfp_mask_from_bits(&bits, &mask); btrfs_debug_check_extent_io_range(tree, start, end); trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits); @@ -1188,10 +1203,10 @@ out: } int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached_state, gfp_t mask) + u32 bits, struct extent_state **cached_state) { return __set_extent_bit(tree, start, end, bits, NULL, NULL, - cached_state, NULL, mask); + cached_state, NULL); } /* @@ -1687,8 +1702,7 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ ASSERT(!(bits & EXTENT_LOCKED)); - return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, - changeset, GFP_NOFS); + return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset); } int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, @@ -1700,8 +1714,7 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ ASSERT(!(bits & EXTENT_LOCKED)); - return __clear_extent_bit(tree, start, end, bits, NULL, GFP_NOFS, - changeset); + return __clear_extent_bit(tree, start, end, bits, NULL, changeset); } int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, @@ -1711,7 +1724,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u64 failed_start; err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, - NULL, cached, NULL, GFP_NOFS); + NULL, cached, NULL); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, @@ -1733,7 +1746,7 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u64 failed_start; err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, - &failed_state, cached_state, NULL, GFP_NOFS); + &failed_state, cached_state, NULL); while (err == -EEXIST) { if (failed_start != start) clear_extent_bit(tree, start, failed_start - 1, @@ -1743,7 +1756,7 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, &failed_state); err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, &failed_state, - cached_state, NULL, GFP_NOFS); + cached_state, NULL); } return err; } diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 21766e49ec02..fbd3b275ab1c 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -43,6 +43,15 @@ enum { * want the extent states to go away. */ ENUM_BIT(EXTENT_CLEAR_ALL_BITS), + + /* + * This must be last. + * + * Bit not representing a state but a request for NOWAIT semantics, + * e.g. when allocating memory, and must be masked out from the other + * bits. + */ + ENUM_BIT(EXTENT_NOWAIT) }; #define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ @@ -127,22 +136,20 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset); int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached, gfp_t mask, + u32 bits, struct extent_state **cached, struct extent_changeset *changeset); static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_state **cached) { - return __clear_extent_bit(tree, start, end, bits, cached, - GFP_NOFS, NULL); + return __clear_extent_bit(tree, start, end, bits, cached, NULL); } static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached) { - return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, - GFP_NOFS, NULL); + return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, NULL); } static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, @@ -154,31 +161,13 @@ static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached_state, gfp_t mask); - -static inline int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, - u64 end, u32 bits) -{ - return set_extent_bit(tree, start, end, bits, NULL, GFP_NOWAIT); -} - -static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, u32 bits) -{ - return set_extent_bit(tree, start, end, bits, NULL, GFP_NOFS); -} + u32 bits, struct extent_state **cached_state); static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state) { return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, - cached_state, GFP_NOFS, NULL); -} - -static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, - u64 end, gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, mask); + cached_state, NULL); } static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, @@ -193,29 +182,6 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, u32 clear_bits, struct extent_state **cached_state); -static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, - u64 end, u32 extra_bits, - struct extent_state **cached_state) -{ - return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | extra_bits, - cached_state, GFP_NOFS); -} - -static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state) -{ - return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_DEFRAG, - cached_state, GFP_NOFS); -} - -static inline int set_extent_new(struct extent_io_tree *tree, u64 start, - u64 end) -{ - return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS); -} - int find_first_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits, struct extent_state **cached_state); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5cd289de4e92..911908ea5f6f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -73,8 +73,8 @@ int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, u64 start, u64 num_bytes) { u64 end = start + num_bytes - 1; - set_extent_bits(&fs_info->excluded_extents, start, end, - EXTENT_UPTODATE); + set_extent_bit(&fs_info->excluded_extents, start, end, + EXTENT_UPTODATE, NULL); return 0; } @@ -402,7 +402,7 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, } } - btrfs_print_leaf((struct extent_buffer *)eb); + btrfs_print_leaf(eb); btrfs_err(eb->fs_info, "eb %llu iref 0x%lx invalid extent inline ref type %d", eb->start, (unsigned long)iref, type); @@ -1164,15 +1164,10 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, * should not happen at all. */ if (owner < BTRFS_FIRST_FREE_OBJECTID) { + btrfs_print_leaf(path->nodes[0]); btrfs_crit(trans->fs_info, -"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu", - bytenr, num_bytes, root_objectid); - if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { - WARN_ON(1); - btrfs_crit(trans->fs_info, - "path->slots[0]=%d path->nodes[0]:", path->slots[0]); - btrfs_print_leaf(path->nodes[0]); - } +"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u", + bytenr, num_bytes, root_objectid, path->slots[0]); return -EUCLEAN; } update_inline_extent_backref(path, iref, refs_to_add, extent_op); @@ -1208,11 +1203,11 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, { int j, ret = 0; u64 bytes_left, end; - u64 aligned_start = ALIGN(start, 1 << 9); + u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT); if (WARN_ON(start != aligned_start)) { len -= aligned_start - start; - len = round_down(len, 1 << 9); + len = round_down(len, 1 << SECTOR_SHIFT); start = aligned_start; } @@ -1250,7 +1245,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, } if (size) { - ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, + ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, + size >> SECTOR_SHIFT, GFP_NOFS); if (!ret) *discarded_bytes += size; @@ -1267,7 +1263,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, } if (bytes_left) { - ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, + ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, + bytes_left >> SECTOR_SHIFT, GFP_NOFS); if (!ret) *discarded_bytes += bytes_left; @@ -1500,7 +1497,7 @@ out: static int run_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, - int insert_reserved) + bool insert_reserved) { int ret = 0; struct btrfs_delayed_data_ref *ref; @@ -1650,7 +1647,7 @@ out: static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, - int insert_reserved) + bool insert_reserved) { int ret = 0; struct btrfs_delayed_tree_ref *ref; @@ -1690,7 +1687,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, static int run_one_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, - int insert_reserved) + bool insert_reserved) { int ret = 0; @@ -1748,7 +1745,7 @@ static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_ref struct btrfs_delayed_ref_head *head) { spin_lock(&delayed_refs->lock); - head->processing = 0; + head->processing = false; delayed_refs->num_heads_ready++; spin_unlock(&delayed_refs->lock); btrfs_delayed_ref_unlock(head); @@ -1900,7 +1897,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_extent_op *extent_op; struct btrfs_delayed_ref_node *ref; - int must_insert_reserved = 0; + bool must_insert_reserved; int ret; delayed_refs = &trans->transaction->delayed_refs; @@ -1916,7 +1913,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, return -EAGAIN; } - ref->in_tree = 0; rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) @@ -1943,7 +1939,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, * spin lock. */ must_insert_reserved = locked_ref->must_insert_reserved; - locked_ref->must_insert_reserved = 0; + locked_ref->must_insert_reserved = false; extent_op = locked_ref->extent_op; locked_ref->extent_op = NULL; @@ -2155,10 +2151,10 @@ out: } int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct extent_buffer *eb, u64 flags, - int level) + struct extent_buffer *eb, u64 flags) { struct btrfs_delayed_extent_op *extent_op; + int level = btrfs_header_level(eb); int ret; extent_op = btrfs_alloc_delayed_extent_op(); @@ -2510,8 +2506,8 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - set_extent_dirty(&trans->transaction->pinned_extents, bytenr, - bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); + set_extent_bit(&trans->transaction->pinned_extents, bytenr, + bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); return 0; } @@ -2838,6 +2834,13 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, return ret; } +#define abort_and_dump(trans, path, fmt, args...) \ +({ \ + btrfs_abort_transaction(trans, -EUCLEAN); \ + btrfs_print_leaf(path->nodes[0]); \ + btrfs_crit(trans->fs_info, fmt, ##args); \ +}) + /* * Drop one or more refs of @node. * @@ -2978,10 +2981,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (!found_extent) { if (iref) { - btrfs_crit(info, -"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref"); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, +"invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref", + path->slots[0]); + ret = -EUCLEAN; + goto out; } /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, extent_root, path, @@ -3029,11 +3033,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } if (ret) { - btrfs_err(info, - "umm, got %d back from search, was looking for %llu", - ret, bytenr); if (ret > 0) btrfs_print_leaf(path->nodes[0]); + btrfs_err(info, + "umm, got %d back from search, was looking for %llu, slot %d", + ret, bytenr, path->slots[0]); } if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -3042,12 +3046,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, extent_slot = path->slots[0]; } } else if (WARN_ON(ret == -ENOENT)) { - btrfs_print_leaf(path->nodes[0]); - btrfs_err(info, - "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", - bytenr, parent, root_objectid, owner_objectid, - owner_offset); - btrfs_abort_transaction(trans, ret); + abort_and_dump(trans, path, +"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d", + bytenr, parent, root_objectid, owner_objectid, + owner_offset, path->slots[0]); goto out; } else { btrfs_abort_transaction(trans, ret); @@ -3067,14 +3069,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && key.type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *bi; + if (item_size < sizeof(*ei) + sizeof(*bi)) { - btrfs_crit(info, -"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu", - key.objectid, key.type, key.offset, - owner_objectid, item_size, - sizeof(*ei) + sizeof(*bi)); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, +"invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu", + key.objectid, key.type, key.offset, + path->slots[0], owner_objectid, item_size, + sizeof(*ei) + sizeof(*bi)); + ret = -EUCLEAN; + goto out; } bi = (struct btrfs_tree_block_info *)(ei + 1); WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); @@ -3082,11 +3085,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, refs = btrfs_extent_refs(leaf, ei); if (refs < refs_to_drop) { - btrfs_crit(info, - "trying to drop %d refs but we only have %llu for bytenr %llu", - refs_to_drop, refs, bytenr); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, + "trying to drop %d refs but we only have %llu for bytenr %llu slot %u", + refs_to_drop, refs, bytenr, path->slots[0]); + ret = -EUCLEAN; + goto out; } refs -= refs_to_drop; @@ -3099,10 +3102,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, */ if (iref) { if (!found_extent) { - btrfs_crit(info, -"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found"); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, +"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u", + path->slots[0]); + ret = -EUCLEAN; + goto out; } } else { btrfs_set_extent_refs(leaf, ei, refs); @@ -3121,21 +3125,21 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (found_extent) { if (is_data && refs_to_drop != extent_data_ref_count(path, iref)) { - btrfs_crit(info, - "invalid refs_to_drop, current refs %u refs_to_drop %u", - extent_data_ref_count(path, iref), - refs_to_drop); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, + "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u", + extent_data_ref_count(path, iref), + refs_to_drop, path->slots[0]); + ret = -EUCLEAN; + goto out; } if (iref) { if (path->slots[0] != extent_slot) { - btrfs_crit(info, -"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref", - key.objectid, key.type, - key.offset); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, +"invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref", + key.objectid, key.type, + key.offset, path->slots[0]); + ret = -EUCLEAN; + goto out; } } else { /* @@ -3145,10 +3149,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] */ if (path->slots[0] != extent_slot + 1) { - btrfs_crit(info, - "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM"); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, + "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM", + path->slots[0]); + ret = -EUCLEAN; + goto out; } path->slots[0] = extent_slot; num_to_del = 2; @@ -3170,19 +3175,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); return ret; -err_dump: - /* - * Leaf dump can take up a lot of log buffer, so we only do full leaf - * dump for debug build. - */ - if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { - btrfs_crit(info, "path->slots[0]=%d extent_slot=%d", - path->slots[0], extent_slot); - btrfs_print_leaf(path->nodes[0]); - } - - btrfs_free_path(path); - return -EUCLEAN; } /* @@ -3219,7 +3211,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, goto out; btrfs_delete_ref_head(delayed_refs, head); - head->processing = 0; + head->processing = false; spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); @@ -4804,7 +4796,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) lockdep_owner = BTRFS_FS_TREE_OBJECTID; - /* btrfs_clean_tree_block() accesses generation field. */ + /* btrfs_clear_buffer_dirty() accesses generation field. */ btrfs_set_header_generation(buf, trans->transid); /* @@ -4836,15 +4828,17 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, * EXTENT bit to differentiate dirty pages. */ if (buf->log_index == 0) - set_extent_dirty(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, GFP_NOFS); + set_extent_bit(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, + EXTENT_DIRTY, NULL); else - set_extent_new(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1); + set_extent_bit(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, + EXTENT_NEW, NULL); } else { buf->log_index = -1; - set_extent_dirty(&trans->transaction->dirty_pages, buf->start, - buf->start + buf->len - 1, GFP_NOFS); + set_extent_bit(&trans->transaction->dirty_pages, buf->start, + buf->start + buf->len - 1, EXTENT_DIRTY, NULL); } /* this returns a buffer locked for blocking */ return buf; @@ -5102,8 +5096,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_set_disk_extent_flags(trans, eb, flag, - btrfs_header_level(eb)); + ret = btrfs_set_disk_extent_flags(trans, eb, flag); BUG_ON(ret); /* -ENOMEM */ wc->flags[level] |= flag; } @@ -5985,9 +5978,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) ret = btrfs_issue_discard(device->bdev, start, len, &bytes); if (!ret) - set_extent_bits(&device->alloc_state, start, - start + bytes - 1, - CHUNK_TRIMMED); + set_extent_bit(&device->alloc_state, start, + start + bytes - 1, CHUNK_TRIMMED, NULL); mutex_unlock(&fs_info->chunk_mutex); if (ret) diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 0c958fc1b3b8..429d5c570061 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -141,7 +141,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct extent_buffer *eb, u64 flags, int level); + struct extent_buffer *eb, u64 flags); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a1adadd5d25d..a91d5ad27984 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -98,33 +98,16 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) */ struct btrfs_bio_ctrl { struct btrfs_bio *bbio; - int mirror_num; enum btrfs_compression_type compress_type; u32 len_to_oe_boundary; blk_opf_t opf; btrfs_bio_end_io_t end_io_func; struct writeback_control *wbc; - - /* - * This is for metadata read, to provide the extra needed verification - * info. This has to be provided for submit_one_bio(), as - * submit_one_bio() can submit a bio if it ends at stripe boundary. If - * no such parent_check is provided, the metadata can hit false alert at - * endio time. - */ - struct btrfs_tree_parent_check *parent_check; - - /* - * Tell writepage not to lock the state bits for this range, it still - * does the unlocking. - */ - bool extent_locked; }; static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_bio *bbio = bio_ctrl->bbio; - int mirror_num = bio_ctrl->mirror_num; if (!bbio) return; @@ -132,25 +115,11 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) /* Caller should ensure the bio has at least some range added */ ASSERT(bbio->bio.bi_iter.bi_size); - if (!is_data_inode(&bbio->inode->vfs_inode)) { - if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) { - /* - * For metadata read, we should have the parent_check, - * and copy it to bbio for metadata verification. - */ - ASSERT(bio_ctrl->parent_check); - memcpy(&bbio->parent_check, - bio_ctrl->parent_check, - sizeof(struct btrfs_tree_parent_check)); - } - bbio->bio.bi_opf |= REQ_META; - } - if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ && bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) - btrfs_submit_compressed_read(bbio, mirror_num); + btrfs_submit_compressed_read(bbio); else - btrfs_submit_bio(bbio, mirror_num); + btrfs_submit_bio(bbio, 0); /* The bbio is owned by the end_io handler now */ bio_ctrl->bbio = NULL; @@ -248,8 +217,6 @@ static int process_one_page(struct btrfs_fs_info *fs_info, if (page_ops & PAGE_SET_ORDERED) btrfs_page_clamp_set_ordered(fs_info, page, start, len); - if (page_ops & PAGE_SET_ERROR) - btrfs_page_clamp_set_error(fs_info, page, start, len); if (page_ops & PAGE_START_WRITEBACK) { btrfs_page_clamp_clear_dirty(fs_info, page, start, len); btrfs_page_clamp_set_writeback(fs_info, page, start, len); @@ -295,9 +262,6 @@ static int __process_pages_contig(struct address_space *mapping, ASSERT(processed_end && *processed_end == start); } - if ((page_ops & PAGE_SET_ERROR) && start_index <= end_index) - mapping_set_error(mapping, -EIO); - folio_batch_init(&fbatch); while (index <= end_index) { int found_folios; @@ -506,6 +470,15 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, start, end, page_ops, NULL); } +static bool btrfs_verify_page(struct page *page, u64 start) +{ + if (!fsverity_active(page->mapping->host) || + PageUptodate(page) || + start >= i_size_read(page->mapping->host)) + return true; + return fsverity_verify_page(page); +} + static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); @@ -513,20 +486,10 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) ASSERT(page_offset(page) <= start && start + len <= page_offset(page) + PAGE_SIZE); - if (uptodate) { - if (fsverity_active(page->mapping->host) && - !PageError(page) && - !PageUptodate(page) && - start < i_size_read(page->mapping->host) && - !fsverity_verify_page(page)) { - btrfs_page_set_error(fs_info, page, start, len); - } else { - btrfs_page_set_uptodate(fs_info, page, start, len); - } - } else { + if (uptodate && btrfs_verify_page(page, start)) + btrfs_page_set_uptodate(fs_info, page, start, len); + else btrfs_page_clear_uptodate(fs_info, page, start, len); - btrfs_page_set_error(fs_info, page, start, len); - } if (!btrfs_is_subpage(fs_info, page)) unlock_page(page); @@ -554,7 +517,6 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) len = end + 1 - start; btrfs_page_clear_uptodate(fs_info, page, start, len); - btrfs_page_set_error(fs_info, page, start, len); ret = err < 0 ? err : -EIO; mapping_set_error(page->mapping, ret); } @@ -574,8 +536,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); struct bio_vec *bvec; - u64 start; - u64 end; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); @@ -584,6 +544,8 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; + u64 start = page_offset(page) + bvec->bv_offset; + u32 len = bvec->bv_len; /* Our read/write should always be sector aligned. */ if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) @@ -595,12 +557,12 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) "incomplete page write with offset %u and length %u", bvec->bv_offset, bvec->bv_len); - start = page_offset(page) + bvec->bv_offset; - end = start + bvec->bv_len - 1; - - end_extent_writepage(page, error, start, end); - - btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len); + btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error); + if (error) { + btrfs_page_clear_uptodate(fs_info, page, start, len); + mapping_set_error(page->mapping, error); + } + btrfs_page_clear_writeback(fs_info, page, start, len); } bio_put(bio); @@ -686,35 +648,6 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) } /* - * Find extent buffer for a givne bytenr. - * - * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking - * in endio context. - */ -static struct extent_buffer *find_extent_buffer_readpage( - struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) -{ - struct extent_buffer *eb; - - /* - * For regular sectorsize, we can use page->private to grab extent - * buffer - */ - if (fs_info->nodesize >= PAGE_SIZE) { - ASSERT(PagePrivate(page) && page->private); - return (struct extent_buffer *)page->private; - } - - /* For subpage case, we need to lookup buffer radix tree */ - rcu_read_lock(); - eb = radix_tree_lookup(&fs_info->buffer_radix, - bytenr >> fs_info->sectorsize_bits); - rcu_read_unlock(); - ASSERT(eb); - return eb; -} - -/* * after a readpage IO is done, we need to: * clear the uptodate bits on error * set the uptodate bits if things worked @@ -735,7 +668,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) * larger than UINT_MAX, u32 here is enough. */ u32 bio_offset = 0; - int mirror; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); @@ -775,11 +707,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) end = start + bvec->bv_len - 1; len = bvec->bv_len; - mirror = bbio->mirror_num; - if (uptodate && !is_data_inode(inode) && - btrfs_validate_metadata_buffer(bbio, page, start, end, mirror)) - uptodate = false; - if (likely(uptodate)) { loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_SHIFT; @@ -800,19 +727,12 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) zero_user_segment(page, zero_start, offset_in_page(end) + 1); } - } else if (!is_data_inode(inode)) { - struct extent_buffer *eb; - - eb = find_extent_buffer_readpage(fs_info, page, start); - set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - eb->read_mirror = mirror; - atomic_dec(&eb->io_pages); } /* Update page status and unlock. */ end_page_read(page, uptodate, start, len); endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, PageUptodate(page)); + start, end, uptodate); ASSERT(bio_offset + len > bio_offset); bio_offset += len; @@ -906,13 +826,8 @@ static void alloc_new_bio(struct btrfs_inode *inode, bio_ctrl->bbio = bbio; bio_ctrl->len_to_oe_boundary = U32_MAX; - /* - * Limit the extent to the ordered boundary for Zone Append. - * Compressed bios aren't submitted directly, so it doesn't apply to - * them. - */ - if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE && - btrfs_use_zone_append(bbio)) { + /* Limit data write bios to the ordered boundary. */ + if (bio_ctrl->wbc) { struct btrfs_ordered_extent *ordered; ordered = btrfs_lookup_ordered_extent(inode, file_offset); @@ -920,11 +835,9 @@ static void alloc_new_bio(struct btrfs_inode *inode, bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, ordered->file_offset + ordered->disk_num_bytes - file_offset); - btrfs_put_ordered_extent(ordered); + bbio->ordered = ordered; } - } - if (bio_ctrl->wbc) { /* * Pick the last added device to support cgroup writeback. For * multi-device file systems this means blk-cgroup policies have @@ -1125,7 +1038,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ret = set_page_extent_mapped(page); if (ret < 0) { unlock_extent(tree, start, end, NULL); - btrfs_page_set_error(fs_info, page, start, PAGE_SIZE); unlock_page(page); return ret; } @@ -1329,11 +1241,9 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, } ret = btrfs_run_delalloc_range(inode, page, delalloc_start, delalloc_end, &page_started, &nr_written, wbc); - if (ret) { - btrfs_page_set_error(inode->root->fs_info, page, - page_offset(page), PAGE_SIZE); + if (ret) return ret; - } + /* * delalloc_end is already one less than the total length, so * we don't subtract one from PAGE_SIZE @@ -1438,7 +1348,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, struct extent_map *em; int ret = 0; int nr = 0; - bool compressed; ret = btrfs_writepage_cow_fixup(page); if (ret) { @@ -1448,12 +1357,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, return 1; } - /* - * we don't want to touch the inode after unlocking the page, - * so we update the mapping writeback index now - */ - bio_ctrl->wbc->nr_to_write--; - bio_ctrl->end_io_func = end_bio_extent_writepage; while (cur <= end) { u64 disk_bytenr; @@ -1486,7 +1389,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); if (IS_ERR(em)) { - btrfs_page_set_error(fs_info, page, cur, end - cur + 1); ret = PTR_ERR_OR_ZERO(em); goto out_error; } @@ -1497,10 +1399,14 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, ASSERT(cur < end); ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize)); ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize)); + block_start = em->block_start; - compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); disk_bytenr = em->block_start + extent_offset; + ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); + ASSERT(block_start != EXTENT_MAP_HOLE); + ASSERT(block_start != EXTENT_MAP_INLINE); + /* * Note that em_end from extent_map_end() and dirty_range_end from * find_next_dirty_byte() are all exclusive @@ -1509,22 +1415,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, free_extent_map(em); em = NULL; - /* - * compressed and inline extents are written through other - * paths in the FS - */ - if (compressed || block_start == EXTENT_MAP_HOLE || - block_start == EXTENT_MAP_INLINE) { - if (compressed) - nr++; - else - btrfs_writepage_endio_finish_ordered(inode, - page, cur, cur + iosize - 1, true); - btrfs_page_clear_dirty(fs_info, page, cur, iosize); - cur += iosize; - continue; - } - btrfs_set_range_writeback(inode, cur, cur + iosize - 1); if (!PageWriteback(page)) { btrfs_err(inode->root->fs_info, @@ -1572,7 +1462,6 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl { struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u64 page_start = page_offset(page); const u64 page_end = page_start + PAGE_SIZE - 1; int ret; @@ -1585,9 +1474,6 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl WARN_ON(!PageLocked(page)); - btrfs_page_clear_error(btrfs_sb(inode->i_sb), page, - page_offset(page), PAGE_SIZE); - pg_offset = offset_in_page(i_size); if (page->index > end_index || (page->index == end_index && !pg_offset)) { @@ -1600,77 +1486,30 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl memzero_page(page, pg_offset, PAGE_SIZE - pg_offset); ret = set_page_extent_mapped(page); - if (ret < 0) { - SetPageError(page); + if (ret < 0) goto done; - } - if (!bio_ctrl->extent_locked) { - ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc); - if (ret == 1) - return 0; - if (ret) - goto done; - } + ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc); + if (ret == 1) + return 0; + if (ret) + goto done; ret = __extent_writepage_io(BTRFS_I(inode), page, bio_ctrl, i_size, &nr); if (ret == 1) return 0; + bio_ctrl->wbc->nr_to_write--; + done: if (nr == 0) { /* make sure the mapping tag for page dirty gets cleared */ set_page_writeback(page); end_page_writeback(page); } - /* - * Here we used to have a check for PageError() and then set @ret and - * call end_extent_writepage(). - * - * But in fact setting @ret here will cause different error paths - * between subpage and regular sectorsize. - * - * For regular page size, we never submit current page, but only add - * current page to current bio. - * The bio submission can only happen in next page. - * Thus if we hit the PageError() branch, @ret is already set to - * non-zero value and will not get updated for regular sectorsize. - * - * But for subpage case, it's possible we submit part of current page, - * thus can get PageError() set by submitted bio of the same page, - * while our @ret is still 0. - * - * So here we unify the behavior and don't set @ret. - * Error can still be properly passed to higher layer as page will - * be set error, here we just don't handle the IO failure. - * - * NOTE: This is just a hotfix for subpage. - * The root fix will be properly ending ordered extent when we hit - * an error during writeback. - * - * But that needs a bigger refactoring, as we not only need to grab the - * submitted OE, but also need to know exactly at which bytenr we hit - * the error. - * Currently the full page based __extent_writepage_io() is not - * capable of that. - */ - if (PageError(page)) + if (ret) end_extent_writepage(page, ret, page_start, page_end); - if (bio_ctrl->extent_locked) { - struct writeback_control *wbc = bio_ctrl->wbc; - - /* - * If bio_ctrl->extent_locked, it's from extent_write_locked_range(), - * the page can either be locked by lock_page() or - * process_one_page(). - * Let btrfs_page_unlock_writer() handle both cases. - */ - ASSERT(wbc); - btrfs_page_unlock_writer(fs_info, page, wbc->range_start, - wbc->range_end + 1 - wbc->range_start); - } else { - unlock_page(page); - } + unlock_page(page); ASSERT(ret <= 0); return ret; } @@ -1681,52 +1520,26 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) TASK_UNINTERRUPTIBLE); } -static void end_extent_buffer_writeback(struct extent_buffer *eb) -{ - clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); - smp_mb__after_atomic(); - wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); -} - /* * Lock extent buffer status and pages for writeback. * - * May try to flush write bio if we can't get the lock. - * - * Return 0 if the extent buffer doesn't need to be submitted. - * (E.g. the extent buffer is not dirty) - * Return >0 is the extent buffer is submitted to bio. - * Return <0 if something went wrong, no page is locked. + * Return %false if the extent buffer doesn't need to be submitted (e.g. the + * extent buffer is not dirty) + * Return %true is the extent buffer is submitted to bio. */ -static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb, - struct btrfs_bio_ctrl *bio_ctrl) +static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *eb, + struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = eb->fs_info; - int i, num_pages; - int flush = 0; - int ret = 0; + bool ret = false; - if (!btrfs_try_tree_write_lock(eb)) { - submit_write_bio(bio_ctrl, 0); - flush = 1; - btrfs_tree_lock(eb); - } - - if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { + btrfs_tree_lock(eb); + while (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { btrfs_tree_unlock(eb); - if (bio_ctrl->wbc->sync_mode != WB_SYNC_ALL) - return 0; - if (!flush) { - submit_write_bio(bio_ctrl, 0); - flush = 1; - } - while (1) { - wait_on_extent_buffer_writeback(eb); - btrfs_tree_lock(eb); - if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) - break; - btrfs_tree_unlock(eb); - } + if (wbc->sync_mode != WB_SYNC_ALL) + return false; + wait_on_extent_buffer_writeback(eb); + btrfs_tree_lock(eb); } /* @@ -1742,45 +1555,19 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, fs_info->dirty_metadata_batch); - ret = 1; + ret = true; } else { spin_unlock(&eb->refs_lock); } - btrfs_tree_unlock(eb); - - /* - * Either we don't need to submit any tree block, or we're submitting - * subpage eb. - * Subpage metadata doesn't use page locking at all, so we can skip - * the page locking. - */ - if (!ret || fs_info->nodesize < PAGE_SIZE) - return ret; - - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - struct page *p = eb->pages[i]; - - if (!trylock_page(p)) { - if (!flush) { - submit_write_bio(bio_ctrl, 0); - flush = 1; - } - lock_page(p); - } - } - return ret; } -static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) +static void set_btree_ioerr(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - btrfs_page_set_error(fs_info, page, eb->start, eb->len); - if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) - return; + set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); /* * A read may stumble upon this buffer later, make sure that it gets an @@ -1794,7 +1581,7 @@ static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) * return a 0 because we are readonly if we don't modify the err seq for * the superblock. */ - mapping_set_error(page->mapping, -EIO); + mapping_set_error(eb->fs_info->btree_inode->i_mapping, -EIO); /* * If writeback for a btree extent that doesn't belong to a log tree @@ -1869,101 +1656,34 @@ static struct extent_buffer *find_extent_buffer_nolock( return NULL; } -/* - * The endio function for subpage extent buffer write. - * - * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback() - * after all extent buffers in the page has finished their writeback. - */ -static void end_bio_subpage_eb_writepage(struct btrfs_bio *bbio) +static void extent_buffer_write_end_io(struct btrfs_bio *bbio) { - struct bio *bio = &bbio->bio; - struct btrfs_fs_info *fs_info; - struct bio_vec *bvec; + struct extent_buffer *eb = bbio->private; + struct btrfs_fs_info *fs_info = eb->fs_info; + bool uptodate = !bbio->bio.bi_status; struct bvec_iter_all iter_all; + struct bio_vec *bvec; + u32 bio_offset = 0; - fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); - ASSERT(fs_info->nodesize < PAGE_SIZE); + if (!uptodate) + set_btree_ioerr(eb); - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { + bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { + u64 start = eb->start + bio_offset; struct page *page = bvec->bv_page; - u64 bvec_start = page_offset(page) + bvec->bv_offset; - u64 bvec_end = bvec_start + bvec->bv_len - 1; - u64 cur_bytenr = bvec_start; - - ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize)); - - /* Iterate through all extent buffers in the range */ - while (cur_bytenr <= bvec_end) { - struct extent_buffer *eb; - int done; - - /* - * Here we can't use find_extent_buffer(), as it may - * try to lock eb->refs_lock, which is not safe in endio - * context. - */ - eb = find_extent_buffer_nolock(fs_info, cur_bytenr); - ASSERT(eb); - - cur_bytenr = eb->start + eb->len; - - ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)); - done = atomic_dec_and_test(&eb->io_pages); - ASSERT(done); - - if (bio->bi_status || - test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { - ClearPageUptodate(page); - set_btree_ioerr(page, eb); - } + u32 len = bvec->bv_len; - btrfs_subpage_clear_writeback(fs_info, page, eb->start, - eb->len); - end_extent_buffer_writeback(eb); - /* - * free_extent_buffer() will grab spinlock which is not - * safe in endio context. Thus here we manually dec - * the ref. - */ - atomic_dec(&eb->refs); - } + if (!uptodate) + btrfs_page_clear_uptodate(fs_info, page, start, len); + btrfs_page_clear_writeback(fs_info, page, start, len); + bio_offset += len; } - bio_put(bio); -} -static void end_bio_extent_buffer_writepage(struct btrfs_bio *bbio) -{ - struct bio *bio = &bbio->bio; - struct bio_vec *bvec; - struct extent_buffer *eb; - int done; - struct bvec_iter_all iter_all; - - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - - eb = (struct extent_buffer *)page->private; - BUG_ON(!eb); - done = atomic_dec_and_test(&eb->io_pages); - - if (bio->bi_status || - test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { - ClearPageUptodate(page); - set_btree_ioerr(page, eb); - } - - end_page_writeback(page); - - if (!done) - continue; - - end_extent_buffer_writeback(eb); - } + clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); - bio_put(bio); + bio_put(&bbio->bio); } static void prepare_eb_write(struct extent_buffer *eb) @@ -1973,7 +1693,6 @@ static void prepare_eb_write(struct extent_buffer *eb) unsigned long end; clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); - atomic_set(&eb->io_pages, num_extent_pages(eb)); /* Set btree blocks beyond nritems with 0 to avoid stale content */ nritems = btrfs_header_nritems(eb); @@ -1995,63 +1714,49 @@ static void prepare_eb_write(struct extent_buffer *eb) } } -/* - * Unlike the work in write_one_eb(), we rely completely on extent locking. - * Page locking is only utilized at minimum to keep the VMM code happy. - */ -static void write_one_subpage_eb(struct extent_buffer *eb, - struct btrfs_bio_ctrl *bio_ctrl) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - struct page *page = eb->pages[0]; - bool no_dirty_ebs = false; - - prepare_eb_write(eb); - - /* clear_page_dirty_for_io() in subpage helper needs page locked */ - lock_page(page); - btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len); - - /* Check if this is the last dirty bit to update nr_written */ - no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page, - eb->start, eb->len); - if (no_dirty_ebs) - clear_page_dirty_for_io(page); - - bio_ctrl->end_io_func = end_bio_subpage_eb_writepage; - - submit_extent_page(bio_ctrl, eb->start, page, eb->len, - eb->start - page_offset(page)); - unlock_page(page); - /* - * Submission finished without problem, if no range of the page is - * dirty anymore, we have submitted a page. Update nr_written in wbc. - */ - if (no_dirty_ebs) - bio_ctrl->wbc->nr_to_write--; -} - static noinline_for_stack void write_one_eb(struct extent_buffer *eb, - struct btrfs_bio_ctrl *bio_ctrl) + struct writeback_control *wbc) { - u64 disk_bytenr = eb->start; - int i, num_pages; + struct btrfs_fs_info *fs_info = eb->fs_info; + struct btrfs_bio *bbio; prepare_eb_write(eb); - bio_ctrl->end_io_func = end_bio_extent_buffer_writepage; - - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - struct page *p = eb->pages[i]; - - clear_page_dirty_for_io(p); - set_page_writeback(p); - submit_extent_page(bio_ctrl, disk_bytenr, p, PAGE_SIZE, 0); - disk_bytenr += PAGE_SIZE; - bio_ctrl->wbc->nr_to_write--; + bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, + REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc), + eb->fs_info, extent_buffer_write_end_io, eb); + bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; + bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); + wbc_init_bio(wbc, &bbio->bio); + bbio->inode = BTRFS_I(eb->fs_info->btree_inode); + bbio->file_offset = eb->start; + if (fs_info->nodesize < PAGE_SIZE) { + struct page *p = eb->pages[0]; + + lock_page(p); + btrfs_subpage_set_writeback(fs_info, p, eb->start, eb->len); + if (btrfs_subpage_clear_and_test_dirty(fs_info, p, eb->start, + eb->len)) { + clear_page_dirty_for_io(p); + wbc->nr_to_write--; + } + __bio_add_page(&bbio->bio, p, eb->len, eb->start - page_offset(p)); + wbc_account_cgroup_owner(wbc, p, eb->len); unlock_page(p); + } else { + for (int i = 0; i < num_extent_pages(eb); i++) { + struct page *p = eb->pages[i]; + + lock_page(p); + clear_page_dirty_for_io(p); + set_page_writeback(p); + __bio_add_page(&bbio->bio, p, PAGE_SIZE, 0); + wbc_account_cgroup_owner(wbc, p, PAGE_SIZE); + wbc->nr_to_write--; + unlock_page(p); + } } + btrfs_submit_bio(bbio, 0); } /* @@ -2068,14 +1773,13 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, * Return >=0 for the number of submitted extent buffers. * Return <0 for fatal error. */ -static int submit_eb_subpage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) +static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); int submitted = 0; u64 page_start = page_offset(page); int bit_start = 0; int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; - int ret; /* Lock and write each dirty extent buffers in the range */ while (bit_start < fs_info->subpage_info->bitmap_nr_bits) { @@ -2121,25 +1825,13 @@ static int submit_eb_subpage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) if (!eb) continue; - ret = lock_extent_buffer_for_io(eb, bio_ctrl); - if (ret == 0) { - free_extent_buffer(eb); - continue; + if (lock_extent_buffer_for_io(eb, wbc)) { + write_one_eb(eb, wbc); + submitted++; } - if (ret < 0) { - free_extent_buffer(eb); - goto cleanup; - } - write_one_subpage_eb(eb, bio_ctrl); free_extent_buffer(eb); - submitted++; } return submitted; - -cleanup: - /* We hit error, end bio for the submitted extent buffers */ - submit_write_bio(bio_ctrl, ret); - return ret; } /* @@ -2162,7 +1854,7 @@ cleanup: * previous call. * Return <0 for fatal error. */ -static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl, +static int submit_eb_page(struct page *page, struct writeback_control *wbc, struct extent_buffer **eb_context) { struct address_space *mapping = page->mapping; @@ -2174,7 +1866,7 @@ static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl, return 0; if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return submit_eb_subpage(page, bio_ctrl); + return submit_eb_subpage(page, wbc); spin_lock(&mapping->private_lock); if (!PagePrivate(page)) { @@ -2207,8 +1899,7 @@ static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl, * If for_sync, this hole will be filled with * trasnsaction commit. */ - if (bio_ctrl->wbc->sync_mode == WB_SYNC_ALL && - !bio_ctrl->wbc->for_sync) + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) ret = -EAGAIN; else ret = 0; @@ -2218,13 +1909,12 @@ static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl, *eb_context = eb; - ret = lock_extent_buffer_for_io(eb, bio_ctrl); - if (ret <= 0) { + if (!lock_extent_buffer_for_io(eb, wbc)) { btrfs_revert_meta_write_pointer(cache, eb); if (cache) btrfs_put_block_group(cache); free_extent_buffer(eb); - return ret; + return 0; } if (cache) { /* @@ -2233,7 +1923,7 @@ static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl, btrfs_schedule_zone_finish_bg(cache, eb); btrfs_put_block_group(cache); } - write_one_eb(eb, bio_ctrl); + write_one_eb(eb, wbc); free_extent_buffer(eb); return 1; } @@ -2242,11 +1932,6 @@ int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { struct extent_buffer *eb_context = NULL; - struct btrfs_bio_ctrl bio_ctrl = { - .wbc = wbc, - .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), - .extent_locked = 0, - }; struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; int ret = 0; int done = 0; @@ -2288,7 +1973,7 @@ retry: for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - ret = submit_eb_page(&folio->page, &bio_ctrl, &eb_context); + ret = submit_eb_page(&folio->page, wbc, &eb_context); if (ret == 0) continue; if (ret < 0) { @@ -2349,8 +2034,6 @@ retry: ret = 0; if (!ret && BTRFS_FS_ERROR(fs_info)) ret = -EROFS; - submit_write_bio(&bio_ctrl, ret); - btrfs_zoned_meta_io_unlock(fs_info); return ret; } @@ -2520,38 +2203,31 @@ retry: * already been ran (aka, ordered extent inserted) and all pages are still * locked. */ -int extent_write_locked_range(struct inode *inode, u64 start, u64 end) +int extent_write_locked_range(struct inode *inode, u64 start, u64 end, + struct writeback_control *wbc) { bool found_error = false; int first_error = 0; int ret = 0; struct address_space *mapping = inode->i_mapping; - struct page *page; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u32 sectorsize = fs_info->sectorsize; + loff_t i_size = i_size_read(inode); u64 cur = start; - unsigned long nr_pages; - const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize; - struct writeback_control wbc_writepages = { - .sync_mode = WB_SYNC_ALL, - .range_start = start, - .range_end = end + 1, - .no_cgroup_owner = 1, - }; struct btrfs_bio_ctrl bio_ctrl = { - .wbc = &wbc_writepages, - /* We're called from an async helper function */ - .opf = REQ_OP_WRITE | REQ_BTRFS_CGROUP_PUNT | - wbc_to_write_flags(&wbc_writepages), - .extent_locked = 1, + .wbc = wbc, + .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), }; + if (wbc->no_cgroup_owner) + bio_ctrl.opf |= REQ_BTRFS_CGROUP_PUNT; + ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); - nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >> - PAGE_SHIFT; - wbc_writepages.nr_to_write = nr_pages * 2; - wbc_attach_fdatawrite_inode(&wbc_writepages, inode); while (cur <= end) { u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); + struct page *page; + int nr = 0; page = find_get_page(mapping, cur >> PAGE_SHIFT); /* @@ -2562,19 +2238,31 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) ASSERT(PageLocked(page)); ASSERT(PageDirty(page)); clear_page_dirty_for_io(page); - ret = __extent_writepage(page, &bio_ctrl); - ASSERT(ret <= 0); + + ret = __extent_writepage_io(BTRFS_I(inode), page, &bio_ctrl, + i_size, &nr); + if (ret == 1) + goto next_page; + + /* Make sure the mapping tag for page dirty gets cleared. */ + if (nr == 0) { + set_page_writeback(page); + end_page_writeback(page); + } + if (ret) + end_extent_writepage(page, ret, cur, cur_end); + btrfs_page_unlock_writer(fs_info, page, cur, cur_end + 1 - cur); if (ret < 0) { found_error = true; first_error = ret; } +next_page: put_page(page); cur = cur_end + 1; } submit_write_bio(&bio_ctrl, found_error ? ret : 0); - wbc_detach_inode(&wbc_writepages); if (found_error) return first_error; return ret; @@ -2588,7 +2276,6 @@ int extent_writepages(struct address_space *mapping, struct btrfs_bio_ctrl bio_ctrl = { .wbc = wbc, .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), - .extent_locked = 0, }; /* @@ -2679,8 +2366,7 @@ static int try_release_extent_state(struct extent_io_tree *tree, * The delalloc new bit will be cleared by ordered extent * completion. */ - ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, - mask, NULL); + ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL); /* if clear_extent_bit failed for enomem reasons, * we can't allow the release to continue. @@ -3421,10 +3107,9 @@ static void __free_extent_buffer(struct extent_buffer *eb) kmem_cache_free(extent_buffer_cache, eb); } -int extent_buffer_under_io(const struct extent_buffer *eb) +static int extent_buffer_under_io(const struct extent_buffer *eb) { - return (atomic_read(&eb->io_pages) || - test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || + return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); } @@ -3557,11 +3242,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, init_rwsem(&eb->lock); btrfs_leak_debug_add_eb(eb); - INIT_LIST_HEAD(&eb->release_list); spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); - atomic_set(&eb->io_pages, 0); ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE); @@ -3678,9 +3361,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) * adequately protected by the refcount, but the TREE_REF bit and * its corresponding reference are not. To protect against this * class of races, we call check_buffer_tree_ref from the codepaths - * which trigger io after they set eb->io_pages. Note that once io is - * initiated, TREE_REF can no longer be cleared, so that is the - * moment at which any such race is best fixed. + * which trigger io. Note that once io is initiated, TREE_REF can no + * longer be cleared, so that is the moment at which any such race is + * best fixed. */ refs = atomic_read(&eb->refs); if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) @@ -3939,7 +3622,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len)); eb->pages[i] = p; - if (!PageUptodate(p)) + if (!btrfs_page_test_uptodate(fs_info, p, eb->start, eb->len)) uptodate = 0; /* @@ -4142,13 +3825,12 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, continue; lock_page(page); btree_clear_page_dirty(page); - ClearPageError(page); unlock_page(page); } WARN_ON(atomic_read(&eb->refs) == 0); } -bool set_extent_buffer_dirty(struct extent_buffer *eb) +void set_extent_buffer_dirty(struct extent_buffer *eb) { int i; int num_pages; @@ -4183,13 +3865,14 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb) eb->start, eb->len); if (subpage) unlock_page(eb->pages[0]); + percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, + eb->len, + eb->fs_info->dirty_metadata_batch); } #ifdef CONFIG_BTRFS_DEBUG for (i = 0; i < num_pages; i++) ASSERT(PageDirty(eb->pages[i])); #endif - - return was_dirty; } void clear_extent_buffer_uptodate(struct extent_buffer *eb) @@ -4242,84 +3925,54 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) } } -static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, - int mirror_num, - struct btrfs_tree_parent_check *check) +static void extent_buffer_read_end_io(struct btrfs_bio *bbio) { + struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; - struct extent_io_tree *io_tree; - struct page *page = eb->pages[0]; - struct extent_state *cached_state = NULL; - struct btrfs_bio_ctrl bio_ctrl = { - .opf = REQ_OP_READ, - .mirror_num = mirror_num, - .parent_check = check, - }; - int ret; + bool uptodate = !bbio->bio.bi_status; + struct bvec_iter_all iter_all; + struct bio_vec *bvec; + u32 bio_offset = 0; - ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); - ASSERT(PagePrivate(page)); - ASSERT(check); - io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; + eb->read_mirror = bbio->mirror_num; - if (wait == WAIT_NONE) { - if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state)) - return -EAGAIN; - } else { - ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state); - if (ret < 0) - return ret; - } + if (uptodate && + btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0) + uptodate = false; - if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) || - PageUptodate(page) || - btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { - set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state); - return 0; + if (uptodate) { + set_extent_buffer_uptodate(eb); + } else { + clear_extent_buffer_uptodate(eb); + set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); } - clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - eb->read_mirror = 0; - atomic_set(&eb->io_pages, 1); - check_buffer_tree_ref(eb); - bio_ctrl.end_io_func = end_bio_extent_readpage; + bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { + u64 start = eb->start + bio_offset; + struct page *page = bvec->bv_page; + u32 len = bvec->bv_len; - btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); + if (uptodate) + btrfs_page_set_uptodate(fs_info, page, start, len); + else + btrfs_page_clear_uptodate(fs_info, page, start, len); - btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); - submit_extent_page(&bio_ctrl, eb->start, page, eb->len, - eb->start - page_offset(page)); - submit_one_bio(&bio_ctrl); - if (wait != WAIT_COMPLETE) { - free_extent_state(cached_state); - return 0; + bio_offset += len; } - wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, - EXTENT_LOCKED, &cached_state); - if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) - return -EIO; - return 0; + clear_bit(EXTENT_BUFFER_READING, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING); + free_extent_buffer(eb); + + bio_put(&bbio->bio); } int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, struct btrfs_tree_parent_check *check) { - int i; - struct page *page; - int locked_pages = 0; - int all_uptodate = 1; - int num_pages; - unsigned long num_reads = 0; - struct btrfs_bio_ctrl bio_ctrl = { - .opf = REQ_OP_READ, - .mirror_num = mirror_num, - .parent_check = check, - }; + int num_pages = num_extent_pages(eb), i; + struct btrfs_bio *bbio; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -4332,87 +3985,39 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))) return -EIO; - if (eb->fs_info->nodesize < PAGE_SIZE) - return read_extent_buffer_subpage(eb, wait, mirror_num, check); - - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; - if (wait == WAIT_NONE) { - /* - * WAIT_NONE is only utilized by readahead. If we can't - * acquire the lock atomically it means either the eb - * is being read out or under modification. - * Either way the eb will be or has been cached, - * readahead can exit safely. - */ - if (!trylock_page(page)) - goto unlock_exit; - } else { - lock_page(page); - } - locked_pages++; - } - /* - * We need to firstly lock all pages to make sure that - * the uptodate bit of our pages won't be affected by - * clear_extent_buffer_uptodate(). - */ - for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; - if (!PageUptodate(page)) { - num_reads++; - all_uptodate = 0; - } - } - - if (all_uptodate) { - set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - goto unlock_exit; - } + /* Someone else is already reading the buffer, just wait for it. */ + if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) + goto done; clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = 0; - atomic_set(&eb->io_pages, num_reads); - /* - * It is possible for release_folio to clear the TREE_REF bit before we - * set io_pages. See check_buffer_tree_ref for a more detailed comment. - */ check_buffer_tree_ref(eb); - bio_ctrl.end_io_func = end_bio_extent_readpage; - for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; - - if (!PageUptodate(page)) { - ClearPageError(page); - submit_extent_page(&bio_ctrl, page_offset(page), page, - PAGE_SIZE, 0); - } else { - unlock_page(page); - } + atomic_inc(&eb->refs); + + bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, + REQ_OP_READ | REQ_META, eb->fs_info, + extent_buffer_read_end_io, eb); + bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; + bbio->inode = BTRFS_I(eb->fs_info->btree_inode); + bbio->file_offset = eb->start; + memcpy(&bbio->parent_check, check, sizeof(*check)); + if (eb->fs_info->nodesize < PAGE_SIZE) { + __bio_add_page(&bbio->bio, eb->pages[0], eb->len, + eb->start - page_offset(eb->pages[0])); + } else { + for (i = 0; i < num_pages; i++) + __bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0); } + btrfs_submit_bio(bbio, mirror_num); - submit_one_bio(&bio_ctrl); - - if (wait != WAIT_COMPLETE) - return 0; - - for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; - wait_on_page_locked(page); - if (!PageUptodate(page)) +done: + if (wait == WAIT_COMPLETE) { + wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); + if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return -EIO; } return 0; - -unlock_exit: - while (locked_pages > 0) { - locked_pages--; - page = eb->pages[locked_pages]; - unlock_page(page); - } - return 0; } static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, @@ -4561,18 +4166,17 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, * looked up. We don't want to complain in this case, as the page was * valid before, we just didn't write it out. Instead we want to catch * the case where we didn't actually read the block properly, which - * would have !PageUptodate && !PageError, as we clear PageError before - * reading. + * would have !PageUptodate and !EXTENT_BUFFER_WRITE_ERR. */ - if (fs_info->nodesize < PAGE_SIZE) { - bool uptodate, error; + if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) + return; - uptodate = btrfs_subpage_test_uptodate(fs_info, page, - eb->start, eb->len); - error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len); - WARN_ON(!uptodate && !error); + if (fs_info->nodesize < PAGE_SIZE) { + if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, page, + eb->start, eb->len))) + btrfs_subpage_dump_bitmap(fs_info, page, eb->start, eb->len); } else { - WARN_ON(!PageUptodate(page) && !PageError(page)); + WARN_ON(!PageUptodate(page)); } } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4341ad978fb8..c5fae3a7d911 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -29,6 +29,8 @@ enum { /* write IO error */ EXTENT_BUFFER_WRITE_ERR, EXTENT_BUFFER_NO_CHECK, + /* Indicate that extent buffer pages a being read */ + EXTENT_BUFFER_READING, }; /* these are flags for __process_pages_contig */ @@ -38,7 +40,6 @@ enum { ENUM_BIT(PAGE_START_WRITEBACK), ENUM_BIT(PAGE_END_WRITEBACK), ENUM_BIT(PAGE_SET_ORDERED), - ENUM_BIT(PAGE_SET_ERROR), ENUM_BIT(PAGE_LOCK), }; @@ -79,7 +80,6 @@ struct extent_buffer { struct btrfs_fs_info *fs_info; spinlock_t refs_lock; atomic_t refs; - atomic_t io_pages; int read_mirror; struct rcu_head rcu_head; pid_t lock_owner; @@ -89,7 +89,6 @@ struct extent_buffer { struct rw_semaphore lock; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; - struct list_head release_list; #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; #endif @@ -179,7 +178,8 @@ int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); int btrfs_read_folio(struct file *file, struct folio *folio); -int extent_write_locked_range(struct inode *inode, u64 start, u64 end); +int extent_write_locked_range(struct inode *inode, u64 start, u64 end, + struct writeback_control *wbc); int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, @@ -262,10 +262,9 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star void extent_buffer_bitmap_clear(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len); -bool set_extent_buffer_dirty(struct extent_buffer *eb); +void set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); -int extent_buffer_under_io(const struct extent_buffer *eb); void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 138afa955370..0cdb3e86f29b 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -364,8 +364,9 @@ static void extent_map_device_set_bits(struct extent_map *em, unsigned bits) struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; - set_extent_bits_nowait(&device->alloc_state, stripe->physical, - stripe->physical + stripe_size - 1, bits); + set_extent_bit(&device->alloc_state, stripe->physical, + stripe->physical + stripe_size - 1, + bits | EXTENT_NOWAIT, NULL); } } @@ -380,8 +381,9 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) struct btrfs_device *device = stripe->dev; __clear_extent_bit(&device->alloc_state, stripe->physical, - stripe->physical + stripe_size - 1, bits, - NULL, GFP_NOWAIT, NULL); + stripe->physical + stripe_size - 1, + bits | EXTENT_NOWAIT, + NULL, NULL); } } @@ -502,10 +504,10 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) RB_CLEAR_NODE(&em->rb_node); } -void replace_extent_mapping(struct extent_map_tree *tree, - struct extent_map *cur, - struct extent_map *new, - int modified) +static void replace_extent_mapping(struct extent_map_tree *tree, + struct extent_map *cur, + struct extent_map *new, + int modified) { lockdep_assert_held_write(&tree->lock); @@ -959,3 +961,95 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode, return ret; } + +/* + * Split off the first pre bytes from the extent_map at [start, start + len], + * and set the block_start for it to new_logical. + * + * This function is used when an ordered_extent needs to be split. + */ +int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, + u64 new_logical) +{ + struct extent_map_tree *em_tree = &inode->extent_tree; + struct extent_map *em; + struct extent_map *split_pre = NULL; + struct extent_map *split_mid = NULL; + int ret = 0; + unsigned long flags; + + ASSERT(pre != 0); + ASSERT(pre < len); + + split_pre = alloc_extent_map(); + if (!split_pre) + return -ENOMEM; + split_mid = alloc_extent_map(); + if (!split_mid) { + ret = -ENOMEM; + goto out_free_pre; + } + + lock_extent(&inode->io_tree, start, start + len - 1, NULL); + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (!em) { + ret = -EIO; + goto out_unlock; + } + + ASSERT(em->len == len); + ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); + ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE); + ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags)); + ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags)); + ASSERT(!list_empty(&em->list)); + + flags = em->flags; + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + + /* First, replace the em with a new extent_map starting from * em->start */ + split_pre->start = em->start; + split_pre->len = pre; + split_pre->orig_start = split_pre->start; + split_pre->block_start = new_logical; + split_pre->block_len = split_pre->len; + split_pre->orig_block_len = split_pre->block_len; + split_pre->ram_bytes = split_pre->len; + split_pre->flags = flags; + split_pre->compress_type = em->compress_type; + split_pre->generation = em->generation; + + replace_extent_mapping(em_tree, em, split_pre, 1); + + /* + * Now we only have an extent_map at: + * [em->start, em->start + pre] + */ + + /* Insert the middle extent_map. */ + split_mid->start = em->start + pre; + split_mid->len = em->len - pre; + split_mid->orig_start = split_mid->start; + split_mid->block_start = em->block_start + pre; + split_mid->block_len = split_mid->len; + split_mid->orig_block_len = split_mid->block_len; + split_mid->ram_bytes = split_mid->len; + split_mid->flags = flags; + split_mid->compress_type = em->compress_type; + split_mid->generation = em->generation; + add_extent_mapping(em_tree, split_mid, 1); + + /* Once for us */ + free_extent_map(em); + /* Once for the tree */ + free_extent_map(em); + +out_unlock: + write_unlock(&em_tree->lock); + unlock_extent(&inode->io_tree, start, start + len - 1, NULL); + free_extent_map(split_mid); +out_free_pre: + free_extent_map(split_pre); + return ret; +} diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index ad311864272a..35d27c756e08 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -90,10 +90,8 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified); void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); -void replace_extent_mapping(struct extent_map_tree *tree, - struct extent_map *cur, - struct extent_map *new, - int modified); +int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, + u64 new_logical); struct extent_map *alloc_extent_map(void); void free_extent_map(struct extent_map *em); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index d1cd0a692f8e..696bf695d8eb 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -94,8 +94,8 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) return 0; - return set_extent_bits(&inode->file_extent_tree, start, start + len - 1, - EXTENT_DIRTY); + return set_extent_bit(&inode->file_extent_tree, start, start + len - 1, + EXTENT_DIRTY, NULL); } /* @@ -438,9 +438,9 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) BTRFS_DATA_RELOC_TREE_OBJECTID) { u64 file_offset = bbio->file_offset + bio_offset; - set_extent_bits(&inode->io_tree, file_offset, - file_offset + sectorsize - 1, - EXTENT_NODATASUM); + set_extent_bit(&inode->io_tree, file_offset, + file_offset + sectorsize - 1, + EXTENT_NODATASUM, NULL); } else { btrfs_warn_rl(fs_info, "csum hole found for disk bytenr range [%llu, %llu)", @@ -560,8 +560,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, goto fail; } - sums->bytenr = start; - sums->len = (int)size; + sums->logical = start; + sums->len = size; offset = bytes_to_csum_size(fs_info, start - key.offset); @@ -721,20 +721,17 @@ fail: */ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) { + struct btrfs_ordered_extent *ordered = bbio->ordered; struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct bio *bio = &bbio->bio; - u64 offset = bbio->file_offset; struct btrfs_ordered_sum *sums; - struct btrfs_ordered_extent *ordered = NULL; char *data; struct bvec_iter iter; struct bio_vec bvec; int index; unsigned int blockcount; - unsigned long total_bytes = 0; - unsigned long this_sum_bytes = 0; int i; unsigned nofs_flag; @@ -749,61 +746,17 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); - sums->bytenr = bio->bi_iter.bi_sector << 9; + sums->logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; index = 0; shash->tfm = fs_info->csum_shash; bio_for_each_segment(bvec, bio, iter) { - if (!ordered) { - ordered = btrfs_lookup_ordered_extent(inode, offset); - /* - * The bio range is not covered by any ordered extent, - * must be a code logic error. - */ - if (unlikely(!ordered)) { - WARN(1, KERN_WARNING - "no ordered extent for root %llu ino %llu offset %llu\n", - inode->root->root_key.objectid, - btrfs_ino(inode), offset); - kvfree(sums); - return BLK_STS_IOERR; - } - } - blockcount = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len + fs_info->sectorsize - 1); for (i = 0; i < blockcount; i++) { - if (!(bio->bi_opf & REQ_BTRFS_ONE_ORDERED) && - !in_range(offset, ordered->file_offset, - ordered->num_bytes)) { - unsigned long bytes_left; - - sums->len = this_sum_bytes; - this_sum_bytes = 0; - btrfs_add_ordered_sum(ordered, sums); - btrfs_put_ordered_extent(ordered); - - bytes_left = bio->bi_iter.bi_size - total_bytes; - - nofs_flag = memalloc_nofs_save(); - sums = kvzalloc(btrfs_ordered_sum_size(fs_info, - bytes_left), GFP_KERNEL); - memalloc_nofs_restore(nofs_flag); - if (!sums) - return BLK_STS_RESOURCE; - - sums->len = bytes_left; - ordered = btrfs_lookup_ordered_extent(inode, - offset); - ASSERT(ordered); /* Logic error */ - sums->bytenr = (bio->bi_iter.bi_sector << 9) - + total_bytes; - index = 0; - } - data = bvec_kmap_local(&bvec); crypto_shash_digest(shash, data + (i * fs_info->sectorsize), @@ -811,15 +764,28 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) sums->sums + index); kunmap_local(data); index += fs_info->csum_size; - offset += fs_info->sectorsize; - this_sum_bytes += fs_info->sectorsize; - total_bytes += fs_info->sectorsize; } } - this_sum_bytes = 0; + + bbio->sums = sums; btrfs_add_ordered_sum(ordered, sums); - btrfs_put_ordered_extent(ordered); + return 0; +} + +/* + * Nodatasum I/O on zoned file systems still requires an btrfs_ordered_sum to + * record the updated logical address on Zone Append completion. + * Allocate just the structure with an empty sums array here for that case. + */ +blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio) +{ + bbio->sums = kmalloc(sizeof(*bbio->sums), GFP_NOFS); + if (!bbio->sums) + return BLK_STS_RESOURCE; + bbio->sums->len = bbio->bio.bi_iter.bi_size; + bbio->sums->logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + btrfs_add_ordered_sum(bbio->ordered, bbio->sums); return 0; } @@ -1086,7 +1052,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, again: next_offset = (u64)-1; found_next = 0; - bytenr = sums->bytenr + total_bytes; + bytenr = sums->logical + total_bytes; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; file_key.offset = bytenr; file_key.type = BTRFS_EXTENT_CSUM_KEY; diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 6be8725cd574..4ec669b69008 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -50,6 +50,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio); +blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, bool nowait); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f649647392e0..392bc7d512a0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1651,7 +1651,6 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, struct file *file = iocb->ki_filp; struct btrfs_inode *inode = BTRFS_I(file_inode(file)); ssize_t num_written, num_sync; - const bool sync = iocb_is_dsync(iocb); /* * If the fs flips readonly due to some impossible error, although we @@ -1664,9 +1663,6 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, if (encoded && (iocb->ki_flags & IOCB_NOWAIT)) return -EOPNOTSUPP; - if (sync) - atomic_inc(&inode->sync_writers); - if (encoded) { num_written = btrfs_encoded_write(iocb, from, encoded); num_sync = encoded->len; @@ -1686,9 +1682,6 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, num_written = num_sync; } - if (sync) - atomic_dec(&inode->sync_writers); - current->backing_dev_info = NULL; return num_written; } @@ -1733,9 +1726,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) * several segments of stripe length (currently 64K). */ blk_start_plug(&plug); - atomic_inc(&BTRFS_I(inode)->sync_writers); ret = btrfs_fdatawrite_range(inode, start, end); - atomic_dec(&BTRFS_I(inode)->sync_writers); blk_finish_plug(&plug); return ret; @@ -3709,7 +3700,8 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) { int ret; - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC | + FMODE_CAN_ODIRECT; ret = fsverity_file_open(inode, filp); if (ret) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index cf98a3c05480..880800418075 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -292,25 +292,6 @@ out: return ret; } -int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv) -{ - u64 needed_bytes; - int ret; - - /* 1 for slack space, 1 for updating the inode */ - needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) + - btrfs_calc_metadata_size(fs_info, 1); - - spin_lock(&rsv->lock); - if (rsv->reserved < needed_bytes) - ret = -ENOSPC; - else - ret = 0; - spin_unlock(&rsv->lock); - return ret; -} - int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct inode *vfs_inode) @@ -923,27 +904,31 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group, while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) { info = rb_entry(n, struct btrfs_free_space, offset_index); if (!info->bitmap) { + const u64 offset = info->offset; + const u64 bytes = info->bytes; + unlink_free_space(ctl, info, true); - ret = btrfs_add_free_space(block_group, info->offset, - info->bytes); + spin_unlock(&ctl->tree_lock); kmem_cache_free(btrfs_free_space_cachep, info); + ret = btrfs_add_free_space(block_group, offset, bytes); + spin_lock(&ctl->tree_lock); } else { u64 offset = info->offset; u64 bytes = ctl->unit; - while (search_bitmap(ctl, info, &offset, &bytes, - false) == 0) { + ret = search_bitmap(ctl, info, &offset, &bytes, false); + if (ret == 0) { + bitmap_clear_bits(ctl, info, offset, bytes, true); + spin_unlock(&ctl->tree_lock); ret = btrfs_add_free_space(block_group, offset, bytes); - if (ret) - break; - bitmap_clear_bits(ctl, info, offset, bytes, true); - offset = info->offset; - bytes = ctl->unit; + spin_lock(&ctl->tree_lock); + } else { + free_bitmap(ctl, info); + ret = 0; } - free_bitmap(ctl, info); } - cond_resched(); + cond_resched_lock(&ctl->tree_lock); } return ret; } @@ -1037,7 +1022,9 @@ int load_free_space_cache(struct btrfs_block_group *block_group) block_group->bytes_super)); if (matched) { + spin_lock(&tmp_ctl.tree_lock); ret = copy_free_space_cache(block_group, &tmp_ctl); + spin_unlock(&tmp_ctl.tree_lock); /* * ret == 1 means we successfully loaded the free space cache, * so we need to re-set it here. @@ -1596,20 +1583,34 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, return bitmap_start; } -static int tree_insert_offset(struct rb_root *root, u64 offset, - struct rb_node *node, int bitmap) +static int tree_insert_offset(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_cluster *cluster, + struct btrfs_free_space *new_entry) { - struct rb_node **p = &root->rb_node; + struct rb_root *root; + struct rb_node **p; struct rb_node *parent = NULL; - struct btrfs_free_space *info; + + lockdep_assert_held(&ctl->tree_lock); + + if (cluster) { + lockdep_assert_held(&cluster->lock); + root = &cluster->root; + } else { + root = &ctl->free_space_offset; + } + + p = &root->rb_node; while (*p) { + struct btrfs_free_space *info; + parent = *p; info = rb_entry(parent, struct btrfs_free_space, offset_index); - if (offset < info->offset) { + if (new_entry->offset < info->offset) { p = &(*p)->rb_left; - } else if (offset > info->offset) { + } else if (new_entry->offset > info->offset) { p = &(*p)->rb_right; } else { /* @@ -1625,7 +1626,7 @@ static int tree_insert_offset(struct rb_root *root, u64 offset, * found a bitmap, we want to go left, or before * logically. */ - if (bitmap) { + if (new_entry->bitmap) { if (info->bitmap) { WARN_ON_ONCE(1); return -EEXIST; @@ -1641,8 +1642,8 @@ static int tree_insert_offset(struct rb_root *root, u64 offset, } } - rb_link_node(node, parent, p); - rb_insert_color(node, root); + rb_link_node(&new_entry->offset_index, parent, p); + rb_insert_color(&new_entry->offset_index, root); return 0; } @@ -1705,6 +1706,8 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, struct rb_node *n = ctl->free_space_offset.rb_node; struct btrfs_free_space *entry = NULL, *prev = NULL; + lockdep_assert_held(&ctl->tree_lock); + /* find entry that is closest to the 'offset' */ while (n) { entry = rb_entry(n, struct btrfs_free_space, offset_index); @@ -1814,6 +1817,8 @@ static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, bool update_stat) { + lockdep_assert_held(&ctl->tree_lock); + rb_erase(&info->offset_index, &ctl->free_space_offset); rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); ctl->free_extents--; @@ -1832,9 +1837,10 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, { int ret = 0; + lockdep_assert_held(&ctl->tree_lock); + ASSERT(info->bytes || info->bitmap); - ret = tree_insert_offset(&ctl->free_space_offset, info->offset, - &info->offset_index, (info->bitmap != NULL)); + ret = tree_insert_offset(ctl, NULL, info); if (ret) return ret; @@ -1862,6 +1868,8 @@ static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, if (RB_EMPTY_NODE(&info->bytes_index)) return; + lockdep_assert_held(&ctl->tree_lock); + rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); } @@ -2447,6 +2455,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, u64 offset = info->offset; u64 bytes = info->bytes; const bool is_trimmed = btrfs_free_space_trimmed(info); + struct rb_node *right_prev = NULL; /* * first we want to see if there is free space adjacent to the range we @@ -2454,9 +2463,11 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, * cover the entire range */ right_info = tree_search_offset(ctl, offset + bytes, 0, 0); - if (right_info && rb_prev(&right_info->offset_index)) - left_info = rb_entry(rb_prev(&right_info->offset_index), - struct btrfs_free_space, offset_index); + if (right_info) + right_prev = rb_prev(&right_info->offset_index); + + if (right_prev) + left_info = rb_entry(right_prev, struct btrfs_free_space, offset_index); else if (!right_info) left_info = tree_search_offset(ctl, offset - 1, 0, 0); @@ -2969,9 +2980,10 @@ static void __btrfs_return_cluster_to_free_space( struct btrfs_free_cluster *cluster) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *entry; struct rb_node *node; + lockdep_assert_held(&ctl->tree_lock); + spin_lock(&cluster->lock); if (cluster->block_group != block_group) { spin_unlock(&cluster->lock); @@ -2984,15 +2996,14 @@ static void __btrfs_return_cluster_to_free_space( node = rb_first(&cluster->root); while (node) { - bool bitmap; + struct btrfs_free_space *entry; entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); rb_erase(&entry->offset_index, &cluster->root); RB_CLEAR_NODE(&entry->offset_index); - bitmap = (entry->bitmap != NULL); - if (!bitmap) { + if (!entry->bitmap) { /* Merging treats extents as if they were new */ if (!btrfs_free_space_trimmed(entry)) { ctl->discardable_extents[BTRFS_STAT_CURR]--; @@ -3010,8 +3021,7 @@ static void __btrfs_return_cluster_to_free_space( entry->bytes; } } - tree_insert_offset(&ctl->free_space_offset, - entry->offset, &entry->offset_index, bitmap); + tree_insert_offset(ctl, NULL, entry); rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, entry_less); } @@ -3324,6 +3334,8 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, unsigned long total_found = 0; int ret; + lockdep_assert_held(&ctl->tree_lock); + i = offset_to_bit(entry->offset, ctl->unit, max_t(u64, offset, entry->offset)); want_bits = bytes_to_bits(bytes, ctl->unit); @@ -3385,8 +3397,7 @@ again: */ RB_CLEAR_NODE(&entry->bytes_index); - ret = tree_insert_offset(&cluster->root, entry->offset, - &entry->offset_index, 1); + ret = tree_insert_offset(ctl, cluster, entry); ASSERT(!ret); /* -EEXIST; Logic error */ trace_btrfs_setup_cluster(block_group, cluster, @@ -3414,6 +3425,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group, u64 max_extent; u64 total_size = 0; + lockdep_assert_held(&ctl->tree_lock); + entry = tree_search_offset(ctl, offset, 0, 1); if (!entry) return -ENOSPC; @@ -3476,8 +3489,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group, rb_erase(&entry->offset_index, &ctl->free_space_offset); rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); - ret = tree_insert_offset(&cluster->root, entry->offset, - &entry->offset_index, 0); + ret = tree_insert_offset(ctl, cluster, entry); total_size += entry->bytes; ASSERT(!ret); /* -EEXIST; Logic error */ } while (node && entry != last); @@ -3671,7 +3683,7 @@ static int do_trimming(struct btrfs_block_group *block_group, __btrfs_add_free_space(block_group, reserved_start, start - reserved_start, reserved_trim_state); - if (start + bytes < reserved_start + reserved_bytes) + if (end < reserved_end) __btrfs_add_free_space(block_group, end, reserved_end - end, reserved_trim_state); __btrfs_add_free_space(block_group, start, bytes, trim_state); diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index a855e0483e03..33b4da3271b1 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -101,8 +101,6 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_block_group *block_group); -int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv); int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct inode *inode); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index b21da1446f2a..045ddce32eca 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1280,7 +1280,10 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) goto abort; btrfs_global_root_delete(free_space_root); + + spin_lock(&fs_info->trans_lock); list_del(&free_space_root->dirty_list); + spin_unlock(&fs_info->trans_lock); btrfs_tree_lock(free_space_root->node); btrfs_clear_buffer_dirty(trans, free_space_root->node); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 0d98fc5f6f44..203d2a267828 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -543,7 +543,6 @@ struct btrfs_fs_info { * A third pool does submit_bio to avoid deadlocking with the other two. */ struct btrfs_workqueue *workers; - struct btrfs_workqueue *hipri_workers; struct btrfs_workqueue *delalloc_workers; struct btrfs_workqueue *flush_workers; struct workqueue_struct *endio_workers; @@ -577,6 +576,7 @@ struct btrfs_fs_info { s32 dirty_metadata_batch; s32 delalloc_batch; + /* Protected by 'trans_lock'. */ struct list_head dirty_cowonly_roots; struct btrfs_fs_devices *fs_devices; @@ -643,7 +643,6 @@ struct btrfs_fs_info { */ refcount_t scrub_workers_refcnt; struct workqueue_struct *scrub_workers; - struct workqueue_struct *scrub_wr_completion_workers; struct btrfs_subpage_info *subpage_info; struct btrfs_discard_ctl discard_ctl; @@ -854,7 +853,7 @@ static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info, static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) { - return fs_info->zone_size > 0; + return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && fs_info->zone_size > 0; } /* diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index b80aeb715701..ede43b6c6559 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -60,6 +60,22 @@ struct btrfs_truncate_control { bool clear_extent_range; }; +/* + * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two + * separate u32s. These two functions convert between the two representations. + */ +static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags) +{ + return (flags | ((u64)ro_flags << 32)); +} + +static inline void btrfs_inode_split_flags(u64 inode_item_flags, + u32 *flags, u32 *ro_flags) +{ + *flags = (u32)inode_item_flags; + *ro_flags = (u32)(inode_item_flags >> 32); +} + int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_truncate_control *control); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7fcafcc5292c..dbbb67293e34 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -70,6 +70,7 @@ #include "verity.h" #include "super.h" #include "orphan.h" +#include "backref.h" struct btrfs_iget_args { u64 ino; @@ -100,6 +101,18 @@ struct btrfs_rename_ctx { u64 index; }; +/* + * Used by data_reloc_print_warning_inode() to pass needed info for filename + * resolution and output of error message. + */ +struct data_reloc_warn { + struct btrfs_path path; + struct btrfs_fs_info *fs_info; + u64 extent_item_size; + u64 logical; + int mirror_num; +}; + static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; static const struct inode_operations btrfs_special_inode_operations; @@ -122,12 +135,198 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 ram_bytes, int compress_type, int type); +static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, + u64 root, void *warn_ctx) +{ + struct data_reloc_warn *warn = warn_ctx; + struct btrfs_fs_info *fs_info = warn->fs_info; + struct extent_buffer *eb; + struct btrfs_inode_item *inode_item; + struct inode_fs_paths *ipath = NULL; + struct btrfs_root *local_root; + struct btrfs_key key; + unsigned int nofs_flag; + u32 nlink; + int ret; + + local_root = btrfs_get_fs_root(fs_info, root, true); + if (IS_ERR(local_root)) { + ret = PTR_ERR(local_root); + goto err; + } + + /* This makes the path point to (inum INODE_ITEM ioff). */ + key.objectid = inum; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0); + if (ret) { + btrfs_put_root(local_root); + btrfs_release_path(&warn->path); + goto err; + } + + eb = warn->path.nodes[0]; + inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item); + nlink = btrfs_inode_nlink(eb, inode_item); + btrfs_release_path(&warn->path); + + nofs_flag = memalloc_nofs_save(); + ipath = init_ipath(4096, local_root, &warn->path); + memalloc_nofs_restore(nofs_flag); + if (IS_ERR(ipath)) { + btrfs_put_root(local_root); + ret = PTR_ERR(ipath); + ipath = NULL; + /* + * -ENOMEM, not a critical error, just output an generic error + * without filename. + */ + btrfs_warn(fs_info, +"checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu", + warn->logical, warn->mirror_num, root, inum, offset); + return ret; + } + ret = paths_from_inode(inum, ipath); + if (ret < 0) + goto err; + + /* + * We deliberately ignore the bit ipath might have been too small to + * hold all of the paths here + */ + for (int i = 0; i < ipath->fspath->elem_cnt; i++) { + btrfs_warn(fs_info, +"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)", + warn->logical, warn->mirror_num, root, inum, offset, + fs_info->sectorsize, nlink, + (char *)(unsigned long)ipath->fspath->val[i]); + } + + btrfs_put_root(local_root); + free_ipath(ipath); + return 0; + +err: + btrfs_warn(fs_info, +"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d", + warn->logical, warn->mirror_num, root, inum, offset, ret); + + free_ipath(ipath); + return ret; +} + +/* + * Do extra user-friendly error output (e.g. lookup all the affected files). + * + * Return true if we succeeded doing the backref lookup. + * Return false if such lookup failed, and has to fallback to the old error message. + */ +static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off, + const u8 *csum, const u8 *csum_expected, + int mirror_num) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_path path = { 0 }; + struct btrfs_key found_key = { 0 }; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + const u32 csum_size = fs_info->csum_size; + u64 logical; + u64 flags; + u32 item_size; + int ret; + + mutex_lock(&fs_info->reloc_mutex); + logical = btrfs_get_reloc_bg_bytenr(fs_info); + mutex_unlock(&fs_info->reloc_mutex); + + if (logical == U64_MAX) { + btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation"); + btrfs_warn_rl(fs_info, +"csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", + inode->root->root_key.objectid, btrfs_ino(inode), file_off, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); + return; + } + + logical += file_off; + btrfs_warn_rl(fs_info, +"csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", + inode->root->root_key.objectid, + btrfs_ino(inode), file_off, logical, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); + + ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags); + if (ret < 0) { + btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d", + logical, ret); + return; + } + eb = path.nodes[0]; + ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size(eb, path.slots[0]); + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + unsigned long ptr = 0; + u64 ref_root; + u8 ref_level; + + while (true) { + ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, + item_size, &ref_root, + &ref_level); + if (ret < 0) { + btrfs_warn_rl(fs_info, + "failed to resolve tree backref for logical %llu: %d", + logical, ret); + break; + } + if (ret > 0) + break; + + btrfs_warn_rl(fs_info, +"csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu", + logical, mirror_num, + (ref_level ? "node" : "leaf"), + ref_level, ref_root); + } + btrfs_release_path(&path); + } else { + struct btrfs_backref_walk_ctx ctx = { 0 }; + struct data_reloc_warn reloc_warn = { 0 }; + + btrfs_release_path(&path); + + ctx.bytenr = found_key.objectid; + ctx.extent_item_pos = logical - found_key.objectid; + ctx.fs_info = fs_info; + + reloc_warn.logical = logical; + reloc_warn.extent_item_size = found_key.offset; + reloc_warn.mirror_num = mirror_num; + reloc_warn.fs_info = fs_info; + + iterate_extent_inodes(&ctx, true, + data_reloc_print_warning_inode, &reloc_warn); + } +} + static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) { struct btrfs_root *root = inode->root; const u32 csum_size = root->fs_info->csum_size; + /* For data reloc tree, it's better to do a backref lookup instead. */ + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + return print_data_reloc_error(inode, logical_start, csum, + csum_expected, mirror_num); + /* Output without objectid, which is more meaningful */ if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) { btrfs_warn_rl(root->fs_info, @@ -636,6 +835,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) { struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct address_space *mapping = inode->vfs_inode.i_mapping; u64 blocksize = fs_info->sectorsize; u64 start = async_chunk->start; u64 end = async_chunk->end; @@ -750,7 +950,7 @@ again: /* Compression level is applied here and only here */ ret = btrfs_compress_pages( compress_type | (fs_info->compress_level << 4), - inode->vfs_inode.i_mapping, start, + mapping, start, pages, &nr_pages, &total_in, @@ -793,9 +993,9 @@ cont: unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING; - unsigned long page_error_op; - page_error_op = ret < 0 ? PAGE_SET_ERROR : 0; + if (ret < 0) + mapping_set_error(mapping, -EIO); /* * inline extent creation worked or returned error, @@ -812,7 +1012,6 @@ cont: clear_flags, PAGE_UNLOCK | PAGE_START_WRITEBACK | - page_error_op | PAGE_END_WRITEBACK); /* @@ -934,6 +1133,12 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, unsigned long nr_written = 0; int page_started = 0; int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .range_start = start, + .range_end = end, + .no_cgroup_owner = 1, + }; /* * Call cow_file_range() to run the delalloc range directly, since we @@ -954,8 +1159,6 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, const u64 page_start = page_offset(locked_page); const u64 page_end = page_start + PAGE_SIZE - 1; - btrfs_page_set_error(inode->root->fs_info, locked_page, - page_start, PAGE_SIZE); set_page_writeback(locked_page); end_page_writeback(locked_page); end_extent_writepage(locked_page, ret, page_start, page_end); @@ -965,7 +1168,10 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, } /* All pages will be unlocked, including @locked_page */ - return extent_write_locked_range(&inode->vfs_inode, start, end); + wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); + ret = extent_write_locked_range(&inode->vfs_inode, start, end, &wbc); + wbc_detach_inode(&wbc); + return ret; } static int submit_one_async_extent(struct btrfs_inode *inode, @@ -976,6 +1182,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode, struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ordered_extent *ordered; struct btrfs_key ins; struct page *locked_page = NULL; struct extent_map *em; @@ -1037,7 +1244,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode, } free_extent_map(em); - ret = btrfs_add_ordered_extent(inode, start, /* file_offset */ + ordered = btrfs_alloc_ordered_extent(inode, start, /* file_offset */ async_extent->ram_size, /* num_bytes */ async_extent->ram_size, /* ram_bytes */ ins.objectid, /* disk_bytenr */ @@ -1045,8 +1252,9 @@ static int submit_one_async_extent(struct btrfs_inode *inode, 0, /* offset */ 1 << BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); - if (ret) { + if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); + ret = PTR_ERR(ordered); goto out_free_reserve; } btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -1055,11 +1263,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode, extent_clear_unlock_delalloc(inode, start, end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_START_WRITEBACK); - - btrfs_submit_compressed_write(inode, start, /* file_offset */ - async_extent->ram_size, /* num_bytes */ - ins.objectid, /* disk_bytenr */ - ins.offset, /* compressed_len */ + btrfs_submit_compressed_write(ordered, async_extent->pages, /* compressed_pages */ async_extent->nr_pages, async_chunk->write_flags, true); @@ -1074,12 +1278,13 @@ out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_free: + mapping_set_error(inode->vfs_inode.i_mapping, -EIO); extent_clear_unlock_delalloc(inode, start, end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | - PAGE_END_WRITEBACK | PAGE_SET_ERROR); + PAGE_END_WRITEBACK); free_async_extent_pages(async_extent); goto done; } @@ -1287,6 +1492,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, min_alloc_size = fs_info->sectorsize; while (num_bytes > 0) { + struct btrfs_ordered_extent *ordered; + cur_alloc_size = num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, min_alloc_size, 0, alloc_hint, @@ -1311,16 +1518,18 @@ static noinline int cow_file_range(struct btrfs_inode *inode, } free_extent_map(em); - ret = btrfs_add_ordered_extent(inode, start, ram_size, ram_size, - ins.objectid, cur_alloc_size, 0, - 1 << BTRFS_ORDERED_REGULAR, - BTRFS_COMPRESS_NONE); - if (ret) + ordered = btrfs_alloc_ordered_extent(inode, start, ram_size, + ram_size, ins.objectid, cur_alloc_size, + 0, 1 << BTRFS_ORDERED_REGULAR, + BTRFS_COMPRESS_NONE); + if (IS_ERR(ordered)) { + ret = PTR_ERR(ordered); goto out_drop_extent_cache; + } if (btrfs_is_data_reloc_root(root)) { - ret = btrfs_reloc_clone_csums(inode, start, - cur_alloc_size); + ret = btrfs_reloc_clone_csums(ordered); + /* * Only drop cache here, and process as normal. * @@ -1337,6 +1546,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, start + ram_size - 1, false); } + btrfs_put_ordered_extent(ordered); btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -1494,7 +1704,7 @@ static noinline void async_cow_submit(struct btrfs_work *work) * ->inode could be NULL if async_chunk_start has failed to compress, * in which case we don't have anything to submit, yet we need to * always adjust ->async_delalloc_pages as its paired with the init - * happening in cow_file_range_async + * happening in run_delalloc_compressed */ if (async_chunk->inode) submit_compressed_extents(async_chunk); @@ -1521,58 +1731,36 @@ static noinline void async_cow_free(struct btrfs_work *work) kvfree(async_cow); } -static int cow_file_range_async(struct btrfs_inode *inode, - struct writeback_control *wbc, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written) +static bool run_delalloc_compressed(struct btrfs_inode *inode, + struct writeback_control *wbc, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc); struct async_cow *ctx; struct async_chunk *async_chunk; unsigned long nr_pages; - u64 cur_end; u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K); int i; - bool should_compress; unsigned nofs_flag; const blk_opf_t write_flags = wbc_to_write_flags(wbc); - unlock_extent(&inode->io_tree, start, end, NULL); - - if (inode->flags & BTRFS_INODE_NOCOMPRESS && - !btrfs_test_opt(fs_info, FORCE_COMPRESS)) { - num_chunks = 1; - should_compress = false; - } else { - should_compress = true; - } - nofs_flag = memalloc_nofs_save(); ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL); memalloc_nofs_restore(nofs_flag); + if (!ctx) + return false; - if (!ctx) { - unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | - EXTENT_DO_ACCOUNTING; - unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | - PAGE_END_WRITEBACK | PAGE_SET_ERROR; - - extent_clear_unlock_delalloc(inode, start, end, locked_page, - clear_bits, page_ops); - return -ENOMEM; - } + unlock_extent(&inode->io_tree, start, end, NULL); + set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); async_chunk = ctx->chunks; atomic_set(&ctx->num_chunks, num_chunks); for (i = 0; i < num_chunks; i++) { - if (should_compress) - cur_end = min(end, start + SZ_512K - 1); - else - cur_end = end; + u64 cur_end = min(end, start + SZ_512K - 1); /* * igrab is called higher up in the call chain, take only the @@ -1633,13 +1821,14 @@ static int cow_file_range_async(struct btrfs_inode *inode, start = cur_end + 1; } *page_started = 1; - return 0; + return true; } static noinline int run_delalloc_zoned(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written) + unsigned long *nr_written, + struct writeback_control *wbc) { u64 done_offset = end; int ret; @@ -1671,8 +1860,8 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, account_page_redirty(locked_page); } locked_page_done = true; - extent_write_locked_range(&inode->vfs_inode, start, done_offset); - + extent_write_locked_range(&inode->vfs_inode, start, done_offset, + wbc); start = done_offset + 1; } @@ -1947,6 +2136,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, nocow_args.writeback_path = true; while (1) { + struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; @@ -1954,6 +2144,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, u64 ram_bytes; u64 nocow_end; int extent_type; + bool is_prealloc; nocow = false; @@ -2092,8 +2283,8 @@ out_check: } nocow_end = cur_offset + nocow_args.num_bytes - 1; - - if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC; + if (is_prealloc) { u64 orig_start = found_key.offset - nocow_args.extent_offset; struct extent_map *em; @@ -2109,29 +2300,22 @@ out_check: goto error; } free_extent_map(em); - ret = btrfs_add_ordered_extent(inode, - cur_offset, nocow_args.num_bytes, - nocow_args.num_bytes, - nocow_args.disk_bytenr, - nocow_args.num_bytes, 0, - 1 << BTRFS_ORDERED_PREALLOC, - BTRFS_COMPRESS_NONE); - if (ret) { + } + + ordered = btrfs_alloc_ordered_extent(inode, cur_offset, + nocow_args.num_bytes, nocow_args.num_bytes, + nocow_args.disk_bytenr, nocow_args.num_bytes, 0, + is_prealloc + ? (1 << BTRFS_ORDERED_PREALLOC) + : (1 << BTRFS_ORDERED_NOCOW), + BTRFS_COMPRESS_NONE); + if (IS_ERR(ordered)) { + if (is_prealloc) { btrfs_drop_extent_map_range(inode, cur_offset, nocow_end, false); - goto error; } - } else { - ret = btrfs_add_ordered_extent(inode, cur_offset, - nocow_args.num_bytes, - nocow_args.num_bytes, - nocow_args.disk_bytenr, - nocow_args.num_bytes, - 0, - 1 << BTRFS_ORDERED_NOCOW, - BTRFS_COMPRESS_NONE); - if (ret) - goto error; + ret = PTR_ERR(ordered); + goto error; } if (nocow) { @@ -2145,8 +2329,8 @@ out_check: * extent_clear_unlock_delalloc() in error handler * from freeing metadata of created ordered extent. */ - ret = btrfs_reloc_clone_csums(inode, cur_offset, - nocow_args.num_bytes); + ret = btrfs_reloc_clone_csums(ordered); + btrfs_put_ordered_extent(ordered); extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, locked_page, EXTENT_LOCKED | @@ -2214,7 +2398,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page u64 start, u64 end, int *page_started, unsigned long *nr_written, struct writeback_control *wbc) { - int ret; + int ret = 0; const bool zoned = btrfs_is_zoned(inode->root->fs_info); /* @@ -2235,19 +2419,23 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root)); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, nr_written); - } else if (!btrfs_inode_can_compress(inode) || - !inode_need_compress(inode, start, end)) { - if (zoned) - ret = run_delalloc_zoned(inode, locked_page, start, end, - page_started, nr_written); - else - ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1, NULL); - } else { - set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); - ret = cow_file_range_async(inode, wbc, locked_page, start, end, - page_started, nr_written); + goto out; } + + if (btrfs_inode_can_compress(inode) && + inode_need_compress(inode, start, end) && + run_delalloc_compressed(inode, wbc, locked_page, start, + end, page_started, nr_written)) + goto out; + + if (zoned) + ret = run_delalloc_zoned(inode, locked_page, start, end, + page_started, nr_written, wbc); + else + ret = cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1, NULL); + +out: ASSERT(ret <= 0); if (ret) btrfs_cleanup_ordered_extents(inode, locked_page, start, @@ -2515,125 +2703,42 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, } } -/* - * Split off the first pre bytes from the extent_map at [start, start + len] - * - * This function is intended to be used only for extract_ordered_extent(). - */ -static int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre) -{ - struct extent_map_tree *em_tree = &inode->extent_tree; - struct extent_map *em; - struct extent_map *split_pre = NULL; - struct extent_map *split_mid = NULL; - int ret = 0; - unsigned long flags; - - ASSERT(pre != 0); - ASSERT(pre < len); - - split_pre = alloc_extent_map(); - if (!split_pre) - return -ENOMEM; - split_mid = alloc_extent_map(); - if (!split_mid) { - ret = -ENOMEM; - goto out_free_pre; - } - - lock_extent(&inode->io_tree, start, start + len - 1, NULL); - write_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); - if (!em) { - ret = -EIO; - goto out_unlock; - } - - ASSERT(em->len == len); - ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); - ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE); - ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags)); - ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags)); - ASSERT(!list_empty(&em->list)); - - flags = em->flags; - clear_bit(EXTENT_FLAG_PINNED, &em->flags); - - /* First, replace the em with a new extent_map starting from * em->start */ - split_pre->start = em->start; - split_pre->len = pre; - split_pre->orig_start = split_pre->start; - split_pre->block_start = em->block_start; - split_pre->block_len = split_pre->len; - split_pre->orig_block_len = split_pre->block_len; - split_pre->ram_bytes = split_pre->len; - split_pre->flags = flags; - split_pre->compress_type = em->compress_type; - split_pre->generation = em->generation; - - replace_extent_mapping(em_tree, em, split_pre, 1); - - /* - * Now we only have an extent_map at: - * [em->start, em->start + pre] - */ - - /* Insert the middle extent_map. */ - split_mid->start = em->start + pre; - split_mid->len = em->len - pre; - split_mid->orig_start = split_mid->start; - split_mid->block_start = em->block_start + pre; - split_mid->block_len = split_mid->len; - split_mid->orig_block_len = split_mid->block_len; - split_mid->ram_bytes = split_mid->len; - split_mid->flags = flags; - split_mid->compress_type = em->compress_type; - split_mid->generation = em->generation; - add_extent_mapping(em_tree, split_mid, 1); - - /* Once for us */ - free_extent_map(em); - /* Once for the tree */ - free_extent_map(em); - -out_unlock: - write_unlock(&em_tree->lock); - unlock_extent(&inode->io_tree, start, start + len - 1, NULL); - free_extent_map(split_mid); -out_free_pre: - free_extent_map(split_pre); - return ret; -} - -int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, - struct btrfs_ordered_extent *ordered) +static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, + struct btrfs_ordered_extent *ordered) { u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; u64 len = bbio->bio.bi_iter.bi_size; - struct btrfs_inode *inode = bbio->inode; - u64 ordered_len = ordered->num_bytes; - int ret = 0; + struct btrfs_ordered_extent *new; + int ret; /* Must always be called for the beginning of an ordered extent. */ if (WARN_ON_ONCE(start != ordered->disk_bytenr)) return -EINVAL; /* No need to split if the ordered extent covers the entire bio. */ - if (ordered->disk_num_bytes == len) + if (ordered->disk_num_bytes == len) { + refcount_inc(&ordered->refs); + bbio->ordered = ordered; return 0; - - ret = btrfs_split_ordered_extent(ordered, len); - if (ret) - return ret; + } /* * Don't split the extent_map for NOCOW extents, as we're writing into * a pre-existing one. */ - if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) - return 0; + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { + ret = split_extent_map(bbio->inode, bbio->file_offset, + ordered->num_bytes, len, + ordered->disk_bytenr); + if (ret) + return ret; + } - return split_extent_map(inode, bbio->file_offset, ordered_len, len); + new = btrfs_split_ordered_extent(ordered, len); + if (IS_ERR(new)) + return PTR_ERR(new); + bbio->ordered = new; + return 0; } /* @@ -2651,7 +2756,7 @@ static int add_pending_csums(struct btrfs_trans_handle *trans, trans->adding_csums = true; if (!csum_root) csum_root = btrfs_csum_root(trans->fs_info, - sum->bytenr); + sum->logical); ret = btrfs_csum_file_blocks(trans, csum_root, sum); trans->adding_csums = false; if (ret) @@ -2689,8 +2794,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, ret = set_extent_bit(&inode->io_tree, search_start, search_start + em_len - 1, - EXTENT_DELALLOC_NEW, cached_state, - GFP_NOFS); + EXTENT_DELALLOC_NEW, cached_state); next: search_start = extent_map_end(em); free_extent_map(em); @@ -2723,8 +2827,8 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, return ret; } - return set_extent_delalloc(&inode->io_tree, start, end, extra_bits, - cached_state); + return set_extent_bit(&inode->io_tree, start, end, + EXTENT_DELALLOC | extra_bits, cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ @@ -2847,7 +2951,6 @@ out_page: mapping_set_error(page->mapping, ret); end_extent_writepage(page, ret, page_start, page_end); clear_page_dirty_for_io(page); - SetPageError(page); } btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE); unlock_page(page); @@ -3068,7 +3171,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, * an ordered extent if the range of bytes in the file it covers are * fully written. */ -int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) +int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) { struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); struct btrfs_root *root = inode->root; @@ -3103,15 +3206,9 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - /* A valid ->physical implies a write on a sequential zone. */ - if (ordered_extent->physical != (u64)-1) { - btrfs_rewrite_logical_zoned(ordered_extent); + if (btrfs_is_zoned(fs_info)) btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); - } else if (btrfs_is_data_reloc_root(inode->root)) { - btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, - ordered_extent->disk_num_bytes); - } if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; @@ -3279,6 +3376,14 @@ out: return ret; } +int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) +{ + if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) && + !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) + btrfs_finish_ordered_zoned(ordered); + return btrfs_finish_one_ordered(ordered); +} + void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, u64 end, bool uptodate) @@ -4226,7 +4331,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) } btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), - 0); + false); ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), &fname.disk_name); @@ -4801,7 +4906,7 @@ again: if (only_release_metadata) set_extent_bit(&inode->io_tree, block_start, block_end, - EXTENT_NORESERVE, NULL, GFP_NOFS); + EXTENT_NORESERVE, NULL); out_unlock: if (ret) { @@ -7670,8 +7775,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, pos += submitted; length -= submitted; if (write) - btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, - pos, length, false); + btrfs_finish_ordered_extent(dio_data->ordered, NULL, + pos, length, false); else unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, NULL); @@ -7701,12 +7806,14 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio) dip->file_offset, dip->bytes, bio->bi_status); } - if (btrfs_op(bio) == BTRFS_MAP_WRITE) - btrfs_mark_ordered_io_finished(inode, NULL, dip->file_offset, - dip->bytes, !bio->bi_status); - else + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { + btrfs_finish_ordered_extent(bbio->ordered, NULL, + dip->file_offset, dip->bytes, + !bio->bi_status); + } else { unlock_extent(&inode->io_tree, dip->file_offset, dip->file_offset + dip->bytes - 1, NULL); + } bbio->bio.bi_private = bbio->private; iomap_dio_bio_end_io(bio); @@ -7742,7 +7849,8 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); if (ret) { - btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); + bbio->bio.bi_status = errno_to_blk_status(ret); + btrfs_dio_end_io(bbio); return; } } @@ -8236,7 +8344,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) int ret; struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; - u64 min_size = btrfs_calc_metadata_size(fs_info, 1); + const u64 min_size = btrfs_calc_metadata_size(fs_info, 1); if (!skip_writeback) { ret = btrfs_wait_ordered_range(&inode->vfs_inode, @@ -8293,7 +8401,15 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) /* Migrate the slack space for the truncate to our reserve */ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); - BUG_ON(ret); + /* + * We have reserved 2 metadata units when we started the transaction and + * min_size matches 1 unit, so this should never fail, but if it does, + * it's not critical we just fail truncation. + */ + if (WARN_ON(ret)) { + btrfs_end_transaction(trans); + goto out; + } trans->block_rsv = rsv; @@ -8341,7 +8457,14 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) btrfs_block_rsv_release(fs_info, rsv, -1, NULL); ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); - BUG_ON(ret); /* shouldn't happen */ + /* + * We have reserved 2 metadata units when we started the + * transaction and min_size matches 1 unit, so this should never + * fail, but if it does, it's not critical we just fail truncation. + */ + if (WARN_ON(ret)) + break; + trans->block_rsv = rsv; } @@ -8468,7 +8591,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->io_tree.inode = ei; extent_io_tree_init(fs_info, &ei->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); - atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->delalloc_inodes); @@ -8639,7 +8761,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap, inode_bytes = inode_get_bytes(inode); spin_unlock(&BTRFS_I(inode)->lock); stat->blocks = (ALIGN(inode_bytes, blocksize) + - ALIGN(delalloc_bytes, blocksize)) >> 9; + ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT; return 0; } @@ -8795,9 +8917,9 @@ static int btrfs_rename_exchange(struct inode *old_dir, if (old_dentry->d_parent != new_dentry->d_parent) { btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), - BTRFS_I(old_inode), 1); + BTRFS_I(old_inode), true); btrfs_record_unlink_dir(trans, BTRFS_I(new_dir), - BTRFS_I(new_inode), 1); + BTRFS_I(new_inode), true); } /* src is a subvolume */ @@ -9063,7 +9185,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), - BTRFS_I(old_inode), 1); + BTRFS_I(old_inode), true); if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); @@ -10170,6 +10292,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, struct extent_io_tree *io_tree = &inode->io_tree; struct extent_changeset *data_reserved = NULL; struct extent_state *cached_state = NULL; + struct btrfs_ordered_extent *ordered; int compression; size_t orig_count; u64 start, end; @@ -10346,14 +10469,15 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, } free_extent_map(em); - ret = btrfs_add_ordered_extent(inode, start, num_bytes, ram_bytes, + ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes, ins.objectid, ins.offset, encoded->unencoded_offset, (1 << BTRFS_ORDERED_ENCODED) | (1 << BTRFS_ORDERED_COMPRESSED), compression); - if (ret) { + if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); + ret = PTR_ERR(ordered); goto out_free_reserved; } btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -10365,8 +10489,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, btrfs_delalloc_release_extents(inode, num_bytes); - btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid, - ins.offset, pages, nr_pages, 0, false); + btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false); ret = orig_count; goto out; @@ -10903,7 +11026,6 @@ static const struct address_space_operations btrfs_aops = { .read_folio = btrfs_read_folio, .writepages = btrfs_writepages, .readahead = btrfs_readahead, - .direct_IO = noop_direct_IO, .invalidate_folio = btrfs_invalidate_folio, .release_folio = btrfs_release_folio, .migrate_folio = btrfs_migrate_folio, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2fa36f694daa..edbbd5cf23fc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -649,6 +649,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap, } trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; + /* Tree log can't currently deal with an inode which is a new root. */ + btrfs_set_log_full_commit(trans); ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit); if (ret) @@ -757,10 +759,7 @@ out: trans->bytes_reserved = 0; btrfs_subvolume_release_metadata(root, &block_rsv); - if (ret) - btrfs_end_transaction(trans); - else - ret = btrfs_commit_transaction(trans); + btrfs_end_transaction(trans); out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: @@ -3113,6 +3112,13 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, struct btrfs_trans_handle *trans; u64 transid; + /* + * Start orphan cleanup here for the given root in case it hasn't been + * started already by other means. Errors are handled in the other + * functions during transaction commit. + */ + btrfs_orphan_cleanup(root); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT) @@ -3134,14 +3140,13 @@ out: static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info, void __user *argp) { - u64 transid; + /* By default wait for the current transaction. */ + u64 transid = 0; - if (argp) { + if (argp) if (copy_from_user(&transid, argp, sizeof(transid))) return -EFAULT; - } else { - transid = 0; /* current trans */ - } + return btrfs_wait_for_commit(fs_info, transid); } diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 3a496b0d3d2b..7979449a58d6 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -57,8 +57,8 @@ static struct btrfs_lockdep_keyset { u64 id; /* root objectid */ - /* Longest entry: btrfs-free-space-00 */ - char names[BTRFS_MAX_LEVEL][20]; + /* Longest entry: btrfs-block-group-00 */ + char names[BTRFS_MAX_LEVEL][24]; struct lock_class_key keys[BTRFS_MAX_LEVEL]; } btrfs_lockdep_keysets[] = { { .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") }, @@ -72,6 +72,7 @@ static struct btrfs_lockdep_keyset { { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") }, { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") }, { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") }, + { .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") }, { .id = 0, DEFINE_NAME("tree") }, }; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 3a095b9c6373..d3fcfc628a4f 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -88,9 +88,9 @@ struct list_head *lzo_alloc_workspace(unsigned int level) if (!workspace) return ERR_PTR(-ENOMEM); - workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); - workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL); - workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL); + workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL | __GFP_NOWARN); + workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL | __GFP_NOWARN); + workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL | __GFP_NOWARN); if (!workspace->mem || !workspace->buf || !workspace->cbuf) goto fail; diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 310a05cf95ef..23fc11af498a 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -252,14 +252,6 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, } #endif -#ifdef CONFIG_BTRFS_ASSERT -void __cold __noreturn btrfs_assertfail(const char *expr, const char *file, int line) -{ - pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); - BUG(); -} -#endif - void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info) { btrfs_err(fs_info, diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index ac2d1982ba3d..deedc1a168e2 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -4,14 +4,23 @@ #define BTRFS_MESSAGES_H #include <linux/types.h> +#include <linux/printk.h> +#include <linux/bug.h> struct btrfs_fs_info; +/* + * We want to be able to override this in btrfs-progs. + */ +#ifdef __KERNEL__ + static inline __printf(2, 3) __cold void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { } +#endif + #ifdef CONFIG_PRINTK #define btrfs_printk(fs_info, fmt, args...) \ @@ -160,7 +169,11 @@ do { \ } while (0) #ifdef CONFIG_BTRFS_ASSERT -void __cold __noreturn btrfs_assertfail(const char *expr, const char *file, int line); + +#define btrfs_assertfail(expr, file, line) ({ \ + pr_err("assertion failed: %s, in %s:%d\n", (expr), (file), (line)); \ + BUG(); \ +}) #define ASSERT(expr) \ (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 768583a440e1..005751a12911 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -143,4 +143,24 @@ static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr, return NULL; } +static inline bool bitmap_test_range_all_set(const unsigned long *addr, + unsigned long start, + unsigned long nbits) +{ + unsigned long found_zero; + + found_zero = find_next_zero_bit(addr, start + nbits, start); + return (found_zero == start + nbits); +} + +static inline bool bitmap_test_range_all_zero(const unsigned long *addr, + unsigned long start, + unsigned long nbits) +{ + unsigned long found_set; + + found_set = find_next_bit(addr, start + nbits, start); + return (found_set == start + nbits); +} + #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a9778a91511e..a629532283bc 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -146,35 +146,11 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, return ret; } -/* - * Add an ordered extent to the per-inode tree. - * - * @inode: Inode that this extent is for. - * @file_offset: Logical offset in file where the extent starts. - * @num_bytes: Logical length of extent in file. - * @ram_bytes: Full length of unencoded data. - * @disk_bytenr: Offset of extent on disk. - * @disk_num_bytes: Size of extent on disk. - * @offset: Offset into unencoded data where file data starts. - * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*). - * @compress_type: Compression algorithm used for data. - * - * Most of these parameters correspond to &struct btrfs_file_extent_item. The - * tree is given a single reference on the ordered extent that was inserted, and - * the returned pointer is given a second reference. - * - * Return: the new ordered extent or error pointer. - */ -struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( - struct btrfs_inode *inode, u64 file_offset, - u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, - u64 disk_num_bytes, u64 offset, unsigned long flags, - int compress_type) +static struct btrfs_ordered_extent *alloc_ordered_extent( + struct btrfs_inode *inode, u64 file_offset, u64 num_bytes, + u64 ram_bytes, u64 disk_bytenr, u64 disk_num_bytes, + u64 offset, unsigned long flags, int compress_type) { - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; - struct rb_node *node; struct btrfs_ordered_extent *entry; int ret; @@ -184,7 +160,6 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); if (ret < 0) return ERR_PTR(ret); - ret = 0; } else { /* * The ordered extent has reserved qgroup space, release now @@ -209,15 +184,7 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( entry->compress_type = compress_type; entry->truncated_len = (u64)-1; entry->qgroup_rsv = ret; - entry->physical = (u64)-1; - - ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0); entry->flags = flags; - - percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes, - fs_info->delalloc_batch); - - /* one ref for the tree */ refcount_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); INIT_LIST_HEAD(&entry->list); @@ -226,15 +193,40 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( INIT_LIST_HEAD(&entry->work_list); init_completion(&entry->completion); + /* + * We don't need the count_max_extents here, we can assume that all of + * that work has been done at higher layers, so this is truly the + * smallest the extent is going to get. + */ + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, 1); + spin_unlock(&inode->lock); + + return entry; +} + +static void insert_ordered_extent(struct btrfs_ordered_extent *entry) +{ + struct btrfs_inode *inode = BTRFS_I(entry->inode); + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct rb_node *node; + trace_btrfs_ordered_extent_add(inode, entry); + percpu_counter_add_batch(&fs_info->ordered_bytes, entry->num_bytes, + fs_info->delalloc_batch); + + /* One ref for the tree. */ + refcount_inc(&entry->refs); + spin_lock_irq(&tree->lock); - node = tree_insert(&tree->tree, file_offset, - &entry->rb_node); + node = tree_insert(&tree->tree, entry->file_offset, &entry->rb_node); if (node) btrfs_panic(fs_info, -EEXIST, "inconsistency in ordered tree at offset %llu", - file_offset); + entry->file_offset); spin_unlock_irq(&tree->lock); spin_lock(&root->ordered_extent_lock); @@ -248,43 +240,43 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( spin_unlock(&fs_info->ordered_root_lock); } spin_unlock(&root->ordered_extent_lock); - - /* - * We don't need the count_max_extents here, we can assume that all of - * that work has been done at higher layers, so this is truly the - * smallest the extent is going to get. - */ - spin_lock(&inode->lock); - btrfs_mod_outstanding_extents(inode, 1); - spin_unlock(&inode->lock); - - /* One ref for the returned entry to match semantics of lookup. */ - refcount_inc(&entry->refs); - - return entry; } /* - * Add a new btrfs_ordered_extent for the range, but drop the reference instead - * of returning it to the caller. + * Add an ordered extent to the per-inode tree. + * + * @inode: Inode that this extent is for. + * @file_offset: Logical offset in file where the extent starts. + * @num_bytes: Logical length of extent in file. + * @ram_bytes: Full length of unencoded data. + * @disk_bytenr: Offset of extent on disk. + * @disk_num_bytes: Size of extent on disk. + * @offset: Offset into unencoded data where file data starts. + * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*). + * @compress_type: Compression algorithm used for data. + * + * Most of these parameters correspond to &struct btrfs_file_extent_item. The + * tree is given a single reference on the ordered extent that was inserted, and + * the returned pointer is given a second reference. + * + * Return: the new ordered extent or error pointer. */ -int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, - u64 disk_num_bytes, u64 offset, unsigned long flags, - int compress_type) +struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( + struct btrfs_inode *inode, u64 file_offset, + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, + u64 disk_num_bytes, u64 offset, unsigned long flags, + int compress_type) { - struct btrfs_ordered_extent *ordered; - - ordered = btrfs_alloc_ordered_extent(inode, file_offset, num_bytes, - ram_bytes, disk_bytenr, - disk_num_bytes, offset, flags, - compress_type); + struct btrfs_ordered_extent *entry; - if (IS_ERR(ordered)) - return PTR_ERR(ordered); - btrfs_put_ordered_extent(ordered); + ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0); - return 0; + entry = alloc_ordered_extent(inode, file_offset, num_bytes, ram_bytes, + disk_bytenr, disk_num_bytes, offset, flags, + compress_type); + if (!IS_ERR(entry)) + insert_ordered_extent(entry); + return entry; } /* @@ -311,6 +303,90 @@ static void finish_ordered_fn(struct btrfs_work *work) btrfs_finish_ordered_io(ordered_extent); } +static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, + struct page *page, u64 file_offset, + u64 len, bool uptodate) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + lockdep_assert_held(&inode->ordered_tree.lock); + + if (page) { + ASSERT(page->mapping); + ASSERT(page_offset(page) <= file_offset); + ASSERT(file_offset + len <= page_offset(page) + PAGE_SIZE); + + /* + * Ordered (Private2) bit indicates whether we still have + * pending io unfinished for the ordered extent. + * + * If there's no such bit, we need to skip to next range. + */ + if (!btrfs_page_test_ordered(fs_info, page, file_offset, len)) + return false; + btrfs_page_clear_ordered(fs_info, page, file_offset, len); + } + + /* Now we're fine to update the accounting. */ + if (WARN_ON_ONCE(len > ordered->bytes_left)) { + btrfs_crit(fs_info, +"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu", + inode->root->root_key.objectid, btrfs_ino(inode), + ordered->file_offset, ordered->num_bytes, + len, ordered->bytes_left); + ordered->bytes_left = 0; + } else { + ordered->bytes_left -= len; + } + + if (!uptodate) + set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); + + if (ordered->bytes_left) + return false; + + /* + * All the IO of the ordered extent is finished, we need to queue + * the finish_func to be executed. + */ + set_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags); + cond_wake_up(&ordered->wait); + refcount_inc(&ordered->refs); + trace_btrfs_ordered_extent_mark_finished(inode, ordered); + return true; +} + +static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ? + fs_info->endio_freespace_worker : fs_info->endio_write_workers; + + btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); + btrfs_queue_work(wq, &ordered->work); +} + +bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, + struct page *page, u64 file_offset, u64 len, + bool uptodate) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + unsigned long flags; + bool ret; + + trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate); + + spin_lock_irqsave(&inode->ordered_tree.lock, flags); + ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate); + spin_unlock_irqrestore(&inode->ordered_tree.lock, flags); + + if (ret) + btrfs_queue_ordered_fn(ordered); + return ret; +} + /* * Mark all ordered extents io inside the specified range finished. * @@ -329,22 +405,11 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, u64 num_bytes, bool uptodate) { struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_workqueue *wq; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; u64 cur = file_offset; - if (btrfs_is_free_space_inode(inode)) - wq = fs_info->endio_freespace_worker; - else - wq = fs_info->endio_write_workers; - - if (page) - ASSERT(page->mapping && page_offset(page) <= file_offset && - file_offset + num_bytes <= page_offset(page) + PAGE_SIZE); - spin_lock_irqsave(&tree->lock, flags); while (cur < file_offset + num_bytes) { u64 entry_end; @@ -397,50 +462,9 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, ASSERT(end + 1 - cur < U32_MAX); len = end + 1 - cur; - if (page) { - /* - * Ordered (Private2) bit indicates whether we still - * have pending io unfinished for the ordered extent. - * - * If there's no such bit, we need to skip to next range. - */ - if (!btrfs_page_test_ordered(fs_info, page, cur, len)) { - cur += len; - continue; - } - btrfs_page_clear_ordered(fs_info, page, cur, len); - } - - /* Now we're fine to update the accounting */ - if (unlikely(len > entry->bytes_left)) { - WARN_ON(1); - btrfs_crit(fs_info, -"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%u left=%llu", - inode->root->root_key.objectid, - btrfs_ino(inode), - entry->file_offset, - entry->num_bytes, - len, entry->bytes_left); - entry->bytes_left = 0; - } else { - entry->bytes_left -= len; - } - - if (!uptodate) - set_bit(BTRFS_ORDERED_IOERR, &entry->flags); - - /* - * All the IO of the ordered extent is finished, we need to queue - * the finish_func to be executed. - */ - if (entry->bytes_left == 0) { - set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); - cond_wake_up(&entry->wait); - refcount_inc(&entry->refs); - trace_btrfs_ordered_extent_mark_finished(inode, entry); + if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) { spin_unlock_irqrestore(&tree->lock, flags); - btrfs_init_work(&entry->work, finish_ordered_fn, NULL, NULL); - btrfs_queue_work(wq, &entry->work); + btrfs_queue_ordered_fn(entry); spin_lock_irqsave(&tree->lock, flags); } cur += len; @@ -564,7 +588,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, freespace_inode = btrfs_is_free_space_inode(btrfs_inode); btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered); - /* This is paired with btrfs_add_ordered_extent. */ + /* This is paired with btrfs_alloc_ordered_extent. */ spin_lock(&btrfs_inode->lock); btrfs_mod_outstanding_extents(btrfs_inode, -1); spin_unlock(&btrfs_inode->lock); @@ -1117,17 +1141,22 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, } /* Split out a new ordered extent for this first @len bytes of @ordered. */ -int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len) +struct btrfs_ordered_extent *btrfs_split_ordered_extent( + struct btrfs_ordered_extent *ordered, u64 len) { - struct inode *inode = ordered->inode; - struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; u64 file_offset = ordered->file_offset; u64 disk_bytenr = ordered->disk_bytenr; - unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS; + unsigned long flags = ordered->flags; + struct btrfs_ordered_sum *sum, *tmpsum; + struct btrfs_ordered_extent *new; struct rb_node *node; + u64 offset = 0; - trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered); + trace_btrfs_ordered_extent_split(inode, ordered); ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED))); @@ -1136,18 +1165,27 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len) * reduce the original extent to a zero length either. */ if (WARN_ON_ONCE(len >= ordered->num_bytes)) - return -EINVAL; - /* We cannot split once ordered extent is past end_bio. */ - if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) - return -EINVAL; + return ERR_PTR(-EINVAL); + /* We cannot split partially completed ordered extents. */ + if (ordered->bytes_left) { + ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS)); + if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) + return ERR_PTR(-EINVAL); + } /* We cannot split a compressed ordered extent. */ if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) - return -EINVAL; - /* Checksum list should be empty. */ - if (WARN_ON_ONCE(!list_empty(&ordered->list))) - return -EINVAL; + return ERR_PTR(-EINVAL); - spin_lock_irq(&tree->lock); + new = alloc_ordered_extent(inode, file_offset, len, len, disk_bytenr, + len, 0, flags, ordered->compress_type); + if (IS_ERR(new)) + return new; + + /* One ref for the tree. */ + refcount_inc(&new->refs); + + spin_lock_irq(&root->ordered_extent_lock); + spin_lock(&tree->lock); /* Remove from tree once */ node = &ordered->rb_node; rb_erase(node, &tree->tree); @@ -1159,26 +1197,48 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len) ordered->disk_bytenr += len; ordered->num_bytes -= len; ordered->disk_num_bytes -= len; - ordered->bytes_left -= len; + + if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) { + ASSERT(ordered->bytes_left == 0); + new->bytes_left = 0; + } else { + ordered->bytes_left -= len; + } + + if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) { + if (ordered->truncated_len > len) { + ordered->truncated_len -= len; + } else { + new->truncated_len = ordered->truncated_len; + ordered->truncated_len = 0; + } + } + + list_for_each_entry_safe(sum, tmpsum, &ordered->list, list) { + if (offset == len) + break; + list_move_tail(&sum->list, &new->list); + offset += sum->len; + } /* Re-insert the node */ node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node); if (node) btrfs_panic(fs_info, -EEXIST, "zoned: inconsistency in ordered tree at offset %llu", - ordered->file_offset); + ordered->file_offset); - spin_unlock_irq(&tree->lock); - - /* - * The splitting extent is already counted and will be added again in - * btrfs_add_ordered_extent(). Subtract len to avoid double counting. - */ - percpu_counter_add_batch(&fs_info->ordered_bytes, -len, fs_info->delalloc_batch); + node = tree_insert(&tree->tree, new->file_offset, &new->rb_node); + if (node) + btrfs_panic(fs_info, -EEXIST, + "zoned: inconsistency in ordered tree at offset %llu", + new->file_offset); + spin_unlock(&tree->lock); - return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len, - disk_bytenr, len, 0, flags, - ordered->compress_type); + list_add_tail(&new->root_extent_list, &root->ordered_extents); + root->nr_ordered_extents++; + spin_unlock_irq(&root->ordered_extent_lock); + return new; } int __init ordered_data_init(void) diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index f0f1138d23c3..173bd5c5df26 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -14,13 +14,13 @@ struct btrfs_ordered_inode_tree { }; struct btrfs_ordered_sum { - /* bytenr is the start of this extent on disk */ - u64 bytenr; - /* - * this is the length in bytes covered by the sums array below. + * Logical start address and length for of the blocks covered by + * the sums array. */ - int len; + u64 logical; + u32 len; + struct list_head list; /* last field is a variable length array of csums */ u8 sums[]; @@ -151,12 +151,6 @@ struct btrfs_ordered_extent { struct completion completion; struct btrfs_work flush_work; struct list_head work_list; - - /* - * Used to reverse-map physical address returned from ZONE_APPEND write - * command in a workqueue context - */ - u64 physical; }; static inline void @@ -167,11 +161,15 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) t->last = NULL; } +int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent); int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); +bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, + struct page *page, u64 file_offset, u64 len, + bool uptodate); void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, struct page *page, u64 file_offset, u64 num_bytes, bool uptodate); @@ -183,10 +181,6 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, u64 disk_num_bytes, u64 offset, unsigned long flags, int compress_type); -int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, - u64 disk_num_bytes, u64 offset, unsigned long flags, - int compress_type); void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, @@ -212,7 +206,8 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, struct extent_state **cached_state); bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); -int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len); +struct btrfs_ordered_extent *btrfs_split_ordered_extent( + struct btrfs_ordered_extent *ordered, u64 len); int __init ordered_data_init(void); void __cold ordered_data_exit(void); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 497b9dbd8a13..aa06d9ca911d 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -49,7 +49,7 @@ const char *btrfs_root_name(const struct btrfs_key *key, char *buf) return buf; } -static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) +static void print_chunk(const struct extent_buffer *eb, struct btrfs_chunk *chunk) { int num_stripes = btrfs_chunk_num_stripes(eb, chunk); int i; @@ -62,7 +62,7 @@ static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) btrfs_stripe_offset_nr(eb, chunk, i)); } } -static void print_dev_item(struct extent_buffer *eb, +static void print_dev_item(const struct extent_buffer *eb, struct btrfs_dev_item *dev_item) { pr_info("\t\tdev item devid %llu total_bytes %llu bytes used %llu\n", @@ -70,7 +70,7 @@ static void print_dev_item(struct extent_buffer *eb, btrfs_device_total_bytes(eb, dev_item), btrfs_device_bytes_used(eb, dev_item)); } -static void print_extent_data_ref(struct extent_buffer *eb, +static void print_extent_data_ref(const struct extent_buffer *eb, struct btrfs_extent_data_ref *ref) { pr_cont("extent data backref root %llu objectid %llu offset %llu count %u\n", @@ -80,7 +80,7 @@ static void print_extent_data_ref(struct extent_buffer *eb, btrfs_extent_data_ref_count(eb, ref)); } -static void print_extent_item(struct extent_buffer *eb, int slot, int type) +static void print_extent_item(const struct extent_buffer *eb, int slot, int type) { struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; @@ -169,7 +169,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) WARN_ON(ptr > end); } -static void print_uuid_item(struct extent_buffer *l, unsigned long offset, +static void print_uuid_item(const struct extent_buffer *l, unsigned long offset, u32 item_size) { if (!IS_ALIGNED(item_size, sizeof(u64))) { @@ -191,7 +191,7 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset, * Helper to output refs and locking status of extent buffer. Useful to debug * race condition related problems. */ -static void print_eb_refs_lock(struct extent_buffer *eb) +static void print_eb_refs_lock(const struct extent_buffer *eb) { #ifdef CONFIG_BTRFS_DEBUG btrfs_info(eb->fs_info, "refs %u lock_owner %u current %u", @@ -199,7 +199,7 @@ static void print_eb_refs_lock(struct extent_buffer *eb) #endif } -void btrfs_print_leaf(struct extent_buffer *l) +void btrfs_print_leaf(const struct extent_buffer *l) { struct btrfs_fs_info *fs_info; int i; @@ -355,7 +355,7 @@ void btrfs_print_leaf(struct extent_buffer *l) } } -void btrfs_print_tree(struct extent_buffer *c, bool follow) +void btrfs_print_tree(const struct extent_buffer *c, bool follow) { struct btrfs_fs_info *fs_info; int i; u32 nr; diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index 8c3e9319ec4e..c42bc666d5ee 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -9,8 +9,8 @@ /* Buffer size to contain tree name and possibly additional data (offset) */ #define BTRFS_ROOT_NAME_BUF_LEN 48 -void btrfs_print_leaf(struct extent_buffer *l); -void btrfs_print_tree(struct extent_buffer *c, bool follow); +void btrfs_print_leaf(const struct extent_buffer *l); +void btrfs_print_tree(const struct extent_buffer *c, bool follow); const char *btrfs_root_name(const struct btrfs_key *key, char *buf); #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f41da7ac360d..da1f84a0eb29 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1232,12 +1232,23 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) int ret = 0; /* - * We need to have subvol_sem write locked, to prevent races between - * concurrent tasks trying to disable quotas, because we will unlock - * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes. + * We need to have subvol_sem write locked to prevent races with + * snapshot creation. */ lockdep_assert_held_write(&fs_info->subvol_sem); + /* + * Lock the cleaner mutex to prevent races with concurrent relocation, + * because relocation may be building backrefs for blocks of the quota + * root while we are deleting the root. This is like dropping fs roots + * of deleted snapshots/subvolumes, we need the same protection. + * + * This also prevents races between concurrent tasks trying to disable + * quotas, because we will unlock and relock qgroup_ioctl_lock across + * BTRFS_FS_QUOTA_ENABLED changes. + */ + mutex_lock(&fs_info->cleaner_mutex); + mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) goto out; @@ -1301,7 +1312,9 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) goto out; } + spin_lock(&fs_info->trans_lock); list_del("a_root->dirty_list); + spin_unlock(&fs_info->trans_lock); btrfs_tree_lock(quota_root->node); btrfs_clear_buffer_dirty(trans, quota_root->node); @@ -1317,6 +1330,7 @@ out: btrfs_end_transaction(trans); else if (trans) ret = btrfs_end_transaction(trans); + mutex_unlock(&fs_info->cleaner_mutex); return ret; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 2fab37f062de..f37b925d587f 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1079,7 +1079,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, /* see if we can add this page onto our existing bio */ if (last) { - u64 last_end = last->bi_iter.bi_sector << 9; + u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT; last_end += last->bi_iter.bi_size; /* @@ -1099,7 +1099,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, bio = bio_alloc(stripe->dev->bdev, max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1), op, GFP_NOFS); - bio->bi_iter.bi_sector = disk_start >> 9; + bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; bio->bi_private = rbio; __bio_add_page(bio, sector->page, sectorsize, sector->pgoff); @@ -2747,3 +2747,48 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) if (!lock_stripe_add(rbio)) start_async_work(rbio, scrub_rbio_work_locked); } + +/* + * This is for scrub call sites where we already have correct data contents. + * This allows us to avoid reading data stripes again. + * + * Unfortunately here we have to do page copy, other than reusing the pages. + * This is due to the fact rbio has its own page management for its cache. + */ +void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, + struct page **data_pages, u64 data_logical) +{ + const u64 offset_in_full_stripe = data_logical - + rbio->bioc->full_stripe_logical; + const int page_index = offset_in_full_stripe >> PAGE_SHIFT; + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 sectors_per_page = PAGE_SIZE / sectorsize; + int ret; + + /* + * If we hit ENOMEM temporarily, but later at + * raid56_parity_submit_scrub_rbio() time it succeeded, we just do + * the extra read, not a big deal. + * + * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time, + * the bio would got proper error number set. + */ + ret = alloc_rbio_data_pages(rbio); + if (ret < 0) + return; + + /* data_logical must be at stripe boundary and inside the full stripe. */ + ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN)); + ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT)); + + for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) { + struct page *dst = rbio->stripe_pages[page_nr + page_index]; + struct page *src = data_pages[page_nr]; + + memcpy_page(dst, 0, src, 0, PAGE_SIZE); + for (int sector_nr = sectors_per_page * page_index; + sector_nr < sectors_per_page * (page_index + 1); + sector_nr++) + rbio->stripe_sectors[sector_nr].uptodate = true; + } +} diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 0f7f31c8cb98..0e84c9c9293f 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -193,6 +193,9 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); +void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, + struct page **data_pages, u64 data_logical); + int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 59a06499c647..25a3361caedc 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -174,8 +174,8 @@ static void mark_block_processed(struct reloc_control *rc, in_range(node->bytenr, rc->block_group->start, rc->block_group->length)) { blocksize = rc->extent_root->fs_info->nodesize; - set_extent_bits(&rc->processed_blocks, node->bytenr, - node->bytenr + blocksize - 1, EXTENT_DIRTY); + set_extent_bit(&rc->processed_blocks, node->bytenr, + node->bytenr + blocksize - 1, EXTENT_DIRTY, NULL); } node->processed = 1; } @@ -3051,9 +3051,9 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, u64 boundary_end = boundary_start + fs_info->sectorsize - 1; - set_extent_bits(&BTRFS_I(inode)->io_tree, - boundary_start, boundary_end, - EXTENT_BOUNDARY); + set_extent_bit(&BTRFS_I(inode)->io_tree, + boundary_start, boundary_end, + EXTENT_BOUNDARY, NULL); } unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, &cached_state); @@ -4342,29 +4342,25 @@ out: * cloning checksum properly handles the nodatasum extents. * it also saves CPU time to re-calculate the checksum. */ -int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) +int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered) { + struct btrfs_inode *inode = BTRFS_I(ordered->inode); struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_root *csum_root; - struct btrfs_ordered_sum *sums; - struct btrfs_ordered_extent *ordered; - int ret; - u64 disk_bytenr; - u64 new_bytenr; + u64 disk_bytenr = ordered->file_offset + inode->index_cnt; + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr); LIST_HEAD(list); + int ret; - ordered = btrfs_lookup_ordered_extent(inode, file_pos); - BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len); - - disk_bytenr = file_pos + inode->index_cnt; - csum_root = btrfs_csum_root(fs_info, disk_bytenr); ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, - disk_bytenr + len - 1, &list, 0, false); + disk_bytenr + ordered->num_bytes - 1, + &list, 0, false); if (ret) - goto out; + return ret; while (!list_empty(&list)) { - sums = list_entry(list.next, struct btrfs_ordered_sum, list); + struct btrfs_ordered_sum *sums = + list_entry(list.next, struct btrfs_ordered_sum, list); + list_del_init(&sums->list); /* @@ -4379,14 +4375,11 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) * disk_len vs real len like with real inodes since it's all * disk length. */ - new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr; - sums->bytenr = new_bytenr; - + sums->logical = ordered->disk_bytenr + sums->logical - disk_bytenr; btrfs_add_ordered_sum(ordered, sums); } -out: - btrfs_put_ordered_extent(ordered); - return ret; + + return 0; } int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, @@ -4523,3 +4516,19 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, ret = clone_backref_node(trans, rc, root, reloc_root); return ret; } + +/* + * Get the current bytenr for the block group which is being relocated. + * + * Return U64_MAX if no running relocation. + */ +u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info) +{ + u64 logical = U64_MAX; + + lockdep_assert_held(&fs_info->reloc_mutex); + + if (fs_info->reloc_ctl && fs_info->reloc_ctl->block_group) + logical = fs_info->reloc_ctl->block_group->start; + return logical; +} diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h index 2041a86186de..77d69f6ae967 100644 --- a/fs/btrfs/relocation.h +++ b/fs/btrfs/relocation.h @@ -8,7 +8,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *r int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); -int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len); +int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered); int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *cow); @@ -19,5 +19,6 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info); struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr); int btrfs_should_ignore_reloc_root(struct btrfs_root *root); +u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 16c228344cbb..4cae41bd6de0 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -177,7 +177,6 @@ struct scrub_ctx { struct btrfs_fs_info *fs_info; int first_free; int cur_stripe; - struct list_head csum_list; atomic_t cancel_req; int readonly; int sectors_per_bio; @@ -309,17 +308,6 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) scrub_pause_off(fs_info); } -static void scrub_free_csums(struct scrub_ctx *sctx) -{ - while (!list_empty(&sctx->csum_list)) { - struct btrfs_ordered_sum *sum; - sum = list_first_entry(&sctx->csum_list, - struct btrfs_ordered_sum, list); - list_del(&sum->list); - kfree(sum); - } -} - static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) { int i; @@ -330,7 +318,6 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) release_scrub_stripe(&sctx->stripes[i]); - scrub_free_csums(sctx); kfree(sctx); } @@ -352,7 +339,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->fs_info = fs_info; - INIT_LIST_HEAD(&sctx->csum_list); for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) { int ret; @@ -479,11 +465,8 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * struct extent_buffer *eb; struct btrfs_extent_item *ei; struct scrub_warning swarn; - unsigned long ptr = 0; u64 flags = 0; - u64 ref_root; u32 item_size; - u8 ref_level = 0; int ret; /* Super block error, no need to search extent tree. */ @@ -513,19 +496,28 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * item_size = btrfs_item_size(eb, path->slots[0]); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - do { + unsigned long ptr = 0; + u8 ref_level; + u64 ref_root; + + while (true) { ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, item_size, &ref_root, &ref_level); + if (ret < 0) { + btrfs_warn(fs_info, + "failed to resolve tree backref for logical %llu: %d", + swarn.logical, ret); + break; + } + if (ret > 0) + break; btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", - errstr, swarn.logical, - btrfs_dev_name(dev), - swarn.physical, - ref_level ? "node" : "leaf", - ret < 0 ? -1 : ref_level, - ret < 0 ? -1 : ref_root); - } while (ret != 1); + errstr, swarn.logical, btrfs_dev_name(dev), + swarn.physical, (ref_level ? "node" : "leaf"), + ref_level, ref_root); + } btrfs_release_path(path); } else { struct btrfs_backref_walk_ctx ctx = { 0 }; @@ -546,48 +538,6 @@ out: btrfs_free_path(path); } -static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc) -{ - if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) - return 2; - else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) - return 3; - else - return (int)bioc->num_stripes; -} - -static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, - u64 full_stripe_logical, - int nstripes, int mirror, - int *stripe_index, - u64 *stripe_offset) -{ - int i; - - if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - const int nr_data_stripes = (map_type & BTRFS_BLOCK_GROUP_RAID5) ? - nstripes - 1 : nstripes - 2; - - /* RAID5/6 */ - for (i = 0; i < nr_data_stripes; i++) { - const u64 data_stripe_start = full_stripe_logical + - (i * BTRFS_STRIPE_LEN); - - if (logical >= data_stripe_start && - logical < data_stripe_start + BTRFS_STRIPE_LEN) - break; - } - - *stripe_index = i; - *stripe_offset = (logical - full_stripe_logical) & - BTRFS_STRIPE_LEN_MASK; - } else { - /* The other RAID type */ - *stripe_index = mirror; - *stripe_offset = 0; - } -} - static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) { int ret = 0; @@ -924,8 +874,9 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, /* For scrub, our mirror_num should always start at 1. */ ASSERT(stripe->mirror_num >= 1); - ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - stripe->logical, &mapped_len, &bioc); + ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, + stripe->logical, &mapped_len, &bioc, + NULL, NULL, 1); /* * If we failed, dev will be NULL, and later detailed reports * will just be skipped. @@ -1957,8 +1908,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, bio->bi_end_io = raid56_scrub_wait_endio; btrfs_bio_counter_inc_blocked(fs_info); - ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, full_stripe_start, - &length, &bioc); + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, + &length, &bioc, NULL, NULL, 1); if (ret < 0) { btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); @@ -1972,6 +1923,13 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, btrfs_bio_counter_dec(fs_info); goto out; } + /* Use the recovered stripes as cache to avoid read them from disk again. */ + for (int i = 0; i < data_stripes; i++) { + stripe = &sctx->raid56_data_stripes[i]; + + raid56_parity_cache_data_pages(rbio, stripe->pages, + full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT)); + } raid56_parity_submit_scrub_rbio(rbio); wait_for_completion_io(&io_done); ret = blk_status_to_errno(bio->bi_status); @@ -2740,17 +2698,12 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info) if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt, &fs_info->scrub_lock)) { struct workqueue_struct *scrub_workers = fs_info->scrub_workers; - struct workqueue_struct *scrub_wr_comp = - fs_info->scrub_wr_completion_workers; fs_info->scrub_workers = NULL; - fs_info->scrub_wr_completion_workers = NULL; mutex_unlock(&fs_info->scrub_lock); if (scrub_workers) destroy_workqueue(scrub_workers); - if (scrub_wr_comp) - destroy_workqueue(scrub_wr_comp); } } @@ -2761,7 +2714,6 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int is_dev_replace) { struct workqueue_struct *scrub_workers = NULL; - struct workqueue_struct *scrub_wr_comp = NULL; unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; int ret = -ENOMEM; @@ -2769,21 +2721,17 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) return 0; - scrub_workers = alloc_workqueue("btrfs-scrub", flags, - is_dev_replace ? 1 : max_active); + if (is_dev_replace) + scrub_workers = alloc_ordered_workqueue("btrfs-scrub", flags); + else + scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active); if (!scrub_workers) - goto fail_scrub_workers; - - scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active); - if (!scrub_wr_comp) - goto fail_scrub_wr_completion_workers; + return -ENOMEM; mutex_lock(&fs_info->scrub_lock); if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { - ASSERT(fs_info->scrub_workers == NULL && - fs_info->scrub_wr_completion_workers == NULL); + ASSERT(fs_info->scrub_workers == NULL); fs_info->scrub_workers = scrub_workers; - fs_info->scrub_wr_completion_workers = scrub_wr_comp; refcount_set(&fs_info->scrub_workers_refcnt, 1); mutex_unlock(&fs_info->scrub_lock); return 0; @@ -2794,10 +2742,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, ret = 0; - destroy_workqueue(scrub_wr_comp); -fail_scrub_wr_completion_workers: destroy_workqueue(scrub_workers); -fail_scrub_workers: return ret; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index af2e153543a5..8bfd44750efe 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1774,9 +1774,21 @@ static int read_symlink(struct btrfs_root *root, ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); type = btrfs_file_extent_type(path->nodes[0], ei); + if (unlikely(type != BTRFS_FILE_EXTENT_INLINE)) { + ret = -EUCLEAN; + btrfs_crit(root->fs_info, +"send: found symlink extent that is not inline, ino %llu root %llu extent type %d", + ino, btrfs_root_id(root), type); + goto out; + } compression = btrfs_file_extent_compression(path->nodes[0], ei); - BUG_ON(type != BTRFS_FILE_EXTENT_INLINE); - BUG_ON(compression); + if (unlikely(compression != BTRFS_COMPRESS_NONE)) { + ret = -EUCLEAN; + btrfs_crit(root->fs_info, +"send: found symlink extent with compression, ino %llu root %llu compression type %d", + ino, btrfs_root_id(root), compression); + goto out; + } off = btrfs_file_extent_inline_start(ei); len = btrfs_file_extent_ram_bytes(path->nodes[0], ei); diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index dd46b978ac2c..1b999c6e4193 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -100,9 +100,6 @@ void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sector subpage_info->uptodate_offset = cur; cur += nr_bits; - subpage_info->error_offset = cur; - cur += nr_bits; - subpage_info->dirty_offset = cur; cur += nr_bits; @@ -367,28 +364,6 @@ void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, unlock_page(page); } -static bool bitmap_test_range_all_set(unsigned long *addr, unsigned int start, - unsigned int nbits) -{ - unsigned int found_zero; - - found_zero = find_next_zero_bit(addr, start + nbits, start); - if (found_zero == start + nbits) - return true; - return false; -} - -static bool bitmap_test_range_all_zero(unsigned long *addr, unsigned int start, - unsigned int nbits) -{ - unsigned int found_set; - - found_set = find_next_bit(addr, start + nbits, start); - if (found_set == start + nbits) - return true; - return false; -} - #define subpage_calc_start_bit(fs_info, page, name, start, len) \ ({ \ unsigned int start_bit; \ @@ -438,35 +413,6 @@ void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, spin_unlock_irqrestore(&subpage->lock, flags); } -void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, - error, start, len); - unsigned long flags; - - spin_lock_irqsave(&subpage->lock, flags); - bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - SetPageError(page); - spin_unlock_irqrestore(&subpage->lock, flags); -} - -void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, - error, start, len); - unsigned long flags; - - spin_lock_irqsave(&subpage->lock, flags); - bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_zero(fs_info, subpage, error)) - ClearPageError(page); - spin_unlock_irqrestore(&subpage->lock, flags); -} - void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { @@ -628,7 +574,6 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ return ret; \ } IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate); -IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered); @@ -696,7 +641,6 @@ bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ } IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate, PageUptodate); -IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError); IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io, PageDirty); IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback, @@ -767,3 +711,44 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, /* Have writers, use proper subpage helper to end it */ btrfs_page_end_writer_lock(fs_info, page, start, len); } + +#define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \ + bitmap_cut(dst, subpage->bitmaps, 0, \ + subpage_info->name##_offset, subpage_info->bitmap_nr_bits) + +void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage_info *subpage_info = fs_info->subpage_info; + struct btrfs_subpage *subpage; + unsigned long uptodate_bitmap; + unsigned long error_bitmap; + unsigned long dirty_bitmap; + unsigned long writeback_bitmap; + unsigned long ordered_bitmap; + unsigned long checked_bitmap; + unsigned long flags; + + ASSERT(PagePrivate(page) && page->private); + ASSERT(subpage_info); + subpage = (struct btrfs_subpage *)page->private; + + spin_lock_irqsave(&subpage->lock, flags); + GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap); + GET_SUBPAGE_BITMAP(subpage, subpage_info, dirty, &dirty_bitmap); + GET_SUBPAGE_BITMAP(subpage, subpage_info, writeback, &writeback_bitmap); + GET_SUBPAGE_BITMAP(subpage, subpage_info, ordered, &ordered_bitmap); + GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap); + spin_unlock_irqrestore(&subpage->lock, flags); + + dump_page(page, "btrfs subpage dump"); + btrfs_warn(fs_info, +"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl error=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", + start, len, page_offset(page), + subpage_info->bitmap_nr_bits, &uptodate_bitmap, + subpage_info->bitmap_nr_bits, &error_bitmap, + subpage_info->bitmap_nr_bits, &dirty_bitmap, + subpage_info->bitmap_nr_bits, &writeback_bitmap, + subpage_info->bitmap_nr_bits, &ordered_bitmap, + subpage_info->bitmap_nr_bits, &checked_bitmap); +} diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 0e80ad336904..5cbf67ccbdeb 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -8,17 +8,17 @@ /* * Extra info for subpapge bitmap. * - * For subpage we pack all uptodate/error/dirty/writeback/ordered bitmaps into + * For subpage we pack all uptodate/dirty/writeback/ordered bitmaps into * one larger bitmap. * * This structure records how they are organized in the bitmap: * - * /- uptodate_offset /- error_offset /- dirty_offset + * /- uptodate_offset /- dirty_offset /- ordered_offset * | | | * v v v - * |u|u|u|u|........|u|u|e|e|.......|e|e| ... |o|o| + * |u|u|u|u|........|u|u|d|d|.......|d|d|o|o|.......|o|o| * |<- bitmap_nr_bits ->| - * |<--------------- total_nr_bits ---------------->| + * |<----------------- total_nr_bits ------------------>| */ struct btrfs_subpage_info { /* Number of bits for each bitmap */ @@ -32,7 +32,6 @@ struct btrfs_subpage_info { * @bitmap_size, which is calculated from PAGE_SIZE / sectorsize. */ unsigned int uptodate_offset; - unsigned int error_offset; unsigned int dirty_offset; unsigned int writeback_offset; unsigned int ordered_offset; @@ -141,7 +140,6 @@ bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len); DECLARE_BTRFS_SUBPAGE_OPS(uptodate); -DECLARE_BTRFS_SUBPAGE_OPS(error); DECLARE_BTRFS_SUBPAGE_OPS(dirty); DECLARE_BTRFS_SUBPAGE_OPS(writeback); DECLARE_BTRFS_SUBPAGE_OPS(ordered); @@ -154,5 +152,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct page *page); void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len); +void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index efeb1a9d040a..8b1c1225245e 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1631,7 +1631,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, old_pool_size, new_pool_size); btrfs_workqueue_set_max(fs_info->workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); workqueue_set_max_active(fs_info->endio_workers, new_pool_size); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index dfc5c7fa6038..f6bc6d738555 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -159,7 +159,7 @@ static int test_find_delalloc(u32 sectorsize) * |--- delalloc ---| * |--- search ---| */ - set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL); + set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL); start = 0; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, @@ -190,7 +190,7 @@ static int test_find_delalloc(u32 sectorsize) test_err("couldn't find the locked page"); goto out_bits; } - set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL); + set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, @@ -245,7 +245,7 @@ static int test_find_delalloc(u32 sectorsize) * * We are re-using our test_start from above since it works out well. */ - set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL); + set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, @@ -503,8 +503,8 @@ static int test_find_first_clear_extent_bit(void) * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between * 4M-32M */ - set_extent_bits(&tree, SZ_1M, SZ_4M - 1, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + set_extent_bit(&tree, SZ_1M, SZ_4M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); find_first_clear_extent_bit(&tree, SZ_512K, &start, &end, CHUNK_TRIMMED | CHUNK_ALLOCATED); @@ -516,8 +516,8 @@ static int test_find_first_clear_extent_bit(void) } /* Now add 32M-64M so that we have a hole between 4M-32M */ - set_extent_bits(&tree, SZ_32M, SZ_64M - 1, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + set_extent_bit(&tree, SZ_32M, SZ_64M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); /* * Request first hole starting at 12M, we should get 4M-32M @@ -548,7 +548,7 @@ static int test_find_first_clear_extent_bit(void) * Set 64M-72M with CHUNK_ALLOC flag, then search for CHUNK_TRIMMED flag * being unset in this range, we should get the entry in range 64M-72M */ - set_extent_bits(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED); + set_extent_bit(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED, NULL); find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end, CHUNK_TRIMMED); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8b6a99b8d7f6..cf306351b148 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -374,8 +374,6 @@ loop: spin_lock_init(&cur_trans->dirty_bgs_lock); INIT_LIST_HEAD(&cur_trans->deleted_bgs); spin_lock_init(&cur_trans->dropped_roots_lock); - INIT_LIST_HEAD(&cur_trans->releasing_ebs); - spin_lock_init(&cur_trans->releasing_ebs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, IO_TREE_TRANS_DIRTY_PAGES); @@ -1056,7 +1054,6 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; - atomic_inc(&BTRFS_I(fs_info->btree_inode)->sync_writers); while (!find_first_extent_bit(dirty_pages, start, &start, &end, mark, &cached_state)) { bool wait_writeback = false; @@ -1092,7 +1089,6 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, cond_resched(); start = end + 1; } - atomic_dec(&BTRFS_I(fs_info->btree_inode)->sync_writers); return werr; } @@ -1688,7 +1684,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * insert the directory item */ ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } /* check if there is a file/dir which has the same name. */ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, @@ -2484,13 +2483,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto scrub_continue; } - /* - * At this point, we should have written all the tree blocks allocated - * in this transaction. So it's now safe to free the redirtyied extent - * buffers. - */ - btrfs_free_redirty_list(cur_trans); - ret = write_all_supers(fs_info, 0); /* * the super is written, we can safely allow the tree-loggers diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index fa728ab80826..8e9fa23bd7fe 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -94,9 +94,6 @@ struct btrfs_transaction { */ atomic_t pending_ordered; wait_queue_head_t pending_wait; - - spinlock_t releasing_ebs_lock; - struct list_head releasing_ebs; }; enum { diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 2138e9fc0564..038dfa8f1788 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -25,10 +25,10 @@ #include "compression.h" #include "volumes.h" #include "misc.h" -#include "btrfs_inode.h" #include "fs.h" #include "accessors.h" #include "file-item.h" +#include "inode-item.h" /* * Error message should follow the following format: @@ -1620,9 +1620,10 @@ static int check_inode_ref(struct extent_buffer *leaf, /* * Common point to switch the item-specific validation. */ -static int check_leaf_item(struct extent_buffer *leaf, - struct btrfs_key *key, int slot, - struct btrfs_key *prev_key) +static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, + struct btrfs_key *key, + int slot, + struct btrfs_key *prev_key) { int ret = 0; struct btrfs_chunk *chunk; @@ -1671,10 +1672,13 @@ static int check_leaf_item(struct extent_buffer *leaf, ret = check_extent_data_ref(leaf, key, slot); break; } - return ret; + + if (ret) + return BTRFS_TREE_BLOCK_INVALID_ITEM; + return BTRFS_TREE_BLOCK_CLEAN; } -static int check_leaf(struct extent_buffer *leaf, bool check_item_data) +enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf) { struct btrfs_fs_info *fs_info = leaf->fs_info; /* No valid key type is 0, so all key should be larger than this key */ @@ -1687,7 +1691,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) generic_err(leaf, 0, "invalid level for leaf, have %d expect 0", btrfs_header_level(leaf)); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_LEVEL; } /* @@ -1710,32 +1714,32 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) generic_err(leaf, 0, "invalid root, root %llu must never be empty", owner); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_NRITEMS; } /* Unknown tree */ if (unlikely(owner == 0)) { generic_err(leaf, 0, "invalid owner, root 0 is not defined"); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_OWNER; } /* EXTENT_TREE_V2 can have empty extent trees. */ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) - return 0; + return BTRFS_TREE_BLOCK_CLEAN; if (unlikely(owner == BTRFS_EXTENT_TREE_OBJECTID)) { generic_err(leaf, 0, "invalid root, root %llu must never be empty", owner); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_NRITEMS; } - return 0; + return BTRFS_TREE_BLOCK_CLEAN; } if (unlikely(nritems == 0)) - return 0; + return BTRFS_TREE_BLOCK_CLEAN; /* * Check the following things to make sure this is a good leaf, and @@ -1751,7 +1755,6 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) for (slot = 0; slot < nritems; slot++) { u32 item_end_expected; u64 item_data_end; - int ret; btrfs_item_key_to_cpu(leaf, &key, slot); @@ -1762,7 +1765,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) prev_key.objectid, prev_key.type, prev_key.offset, key.objectid, key.type, key.offset); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_BAD_KEY_ORDER; } item_data_end = (u64)btrfs_item_offset(leaf, slot) + @@ -1781,7 +1784,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) generic_err(leaf, slot, "unexpected item end, have %llu expect %u", item_data_end, item_end_expected); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_OFFSETS; } /* @@ -1793,7 +1796,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) generic_err(leaf, slot, "slot end outside of leaf, have %llu expect range [0, %u]", item_data_end, BTRFS_LEAF_DATA_SIZE(fs_info)); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_OFFSETS; } /* Also check if the item pointer overlaps with btrfs item. */ @@ -1804,16 +1807,22 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) btrfs_item_nr_offset(leaf, slot) + sizeof(struct btrfs_item), btrfs_item_ptr_offset(leaf, slot)); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_OFFSETS; } - if (check_item_data) { + /* + * We only want to do this if WRITTEN is set, otherwise the leaf + * may be in some intermediate state and won't appear valid. + */ + if (btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN)) { + enum btrfs_tree_block_status ret; + /* * Check if the item size and content meet other * criteria */ ret = check_leaf_item(leaf, &key, slot, &prev_key); - if (unlikely(ret < 0)) + if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) return ret; } @@ -1822,21 +1831,21 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) prev_key.offset = key.offset; } - return 0; + return BTRFS_TREE_BLOCK_CLEAN; } -int btrfs_check_leaf_full(struct extent_buffer *leaf) +int btrfs_check_leaf(struct extent_buffer *leaf) { - return check_leaf(leaf, true); -} -ALLOW_ERROR_INJECTION(btrfs_check_leaf_full, ERRNO); + enum btrfs_tree_block_status ret; -int btrfs_check_leaf_relaxed(struct extent_buffer *leaf) -{ - return check_leaf(leaf, false); + ret = __btrfs_check_leaf(leaf); + if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) + return -EUCLEAN; + return 0; } +ALLOW_ERROR_INJECTION(btrfs_check_leaf, ERRNO); -int btrfs_check_node(struct extent_buffer *node) +enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node) { struct btrfs_fs_info *fs_info = node->fs_info; unsigned long nr = btrfs_header_nritems(node); @@ -1844,13 +1853,12 @@ int btrfs_check_node(struct extent_buffer *node) int slot; int level = btrfs_header_level(node); u64 bytenr; - int ret = 0; if (unlikely(level <= 0 || level >= BTRFS_MAX_LEVEL)) { generic_err(node, 0, "invalid level for node, have %d expect [1, %d]", level, BTRFS_MAX_LEVEL - 1); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_LEVEL; } if (unlikely(nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info))) { btrfs_crit(fs_info, @@ -1858,7 +1866,7 @@ int btrfs_check_node(struct extent_buffer *node) btrfs_header_owner(node), node->start, nr == 0 ? "small" : "large", nr, BTRFS_NODEPTRS_PER_BLOCK(fs_info)); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_NRITEMS; } for (slot = 0; slot < nr - 1; slot++) { @@ -1869,15 +1877,13 @@ int btrfs_check_node(struct extent_buffer *node) if (unlikely(!bytenr)) { generic_err(node, slot, "invalid NULL node pointer"); - ret = -EUCLEAN; - goto out; + return BTRFS_TREE_BLOCK_INVALID_BLOCKPTR; } if (unlikely(!IS_ALIGNED(bytenr, fs_info->sectorsize))) { generic_err(node, slot, "unaligned pointer, have %llu should be aligned to %u", bytenr, fs_info->sectorsize); - ret = -EUCLEAN; - goto out; + return BTRFS_TREE_BLOCK_INVALID_BLOCKPTR; } if (unlikely(btrfs_comp_cpu_keys(&key, &next_key) >= 0)) { @@ -1886,12 +1892,20 @@ int btrfs_check_node(struct extent_buffer *node) key.objectid, key.type, key.offset, next_key.objectid, next_key.type, next_key.offset); - ret = -EUCLEAN; - goto out; + return BTRFS_TREE_BLOCK_BAD_KEY_ORDER; } } -out: - return ret; + return BTRFS_TREE_BLOCK_CLEAN; +} + +int btrfs_check_node(struct extent_buffer *node) +{ + enum btrfs_tree_block_status ret; + + ret = __btrfs_check_node(node); + if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) + return -EUCLEAN; + return 0; } ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO); @@ -1949,3 +1963,61 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) } return 0; } + +int btrfs_verify_level_key(struct extent_buffer *eb, int level, + struct btrfs_key *first_key, u64 parent_transid) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + int found_level; + struct btrfs_key found_key; + int ret; + + found_level = btrfs_header_level(eb); + if (found_level != level) { + WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), + KERN_ERR "BTRFS: tree level check failed\n"); + btrfs_err(fs_info, +"tree level mismatch detected, bytenr=%llu level expected=%u has=%u", + eb->start, level, found_level); + return -EIO; + } + + if (!first_key) + return 0; + + /* + * For live tree block (new tree blocks in current transaction), + * we need proper lock context to avoid race, which is impossible here. + * So we only checks tree blocks which is read from disk, whose + * generation <= fs_info->last_trans_committed. + */ + if (btrfs_header_generation(eb) > fs_info->last_trans_committed) + return 0; + + /* We have @first_key, so this @eb must have at least one item */ + if (btrfs_header_nritems(eb) == 0) { + btrfs_err(fs_info, + "invalid tree nritems, bytenr=%llu nritems=0 expect >0", + eb->start); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + return -EUCLEAN; + } + + if (found_level) + btrfs_node_key_to_cpu(eb, &found_key, 0); + else + btrfs_item_key_to_cpu(eb, &found_key, 0); + ret = btrfs_comp_cpu_keys(first_key, &found_key); + + if (ret) { + WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), + KERN_ERR "BTRFS: tree first key check failed\n"); + btrfs_err(fs_info, +"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", + eb->start, parent_transid, first_key->objectid, + first_key->type, first_key->offset, + found_key.objectid, found_key.type, + found_key.offset); + } + return ret; +} diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index bfb5efa4e01f..3c2a02a72f64 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -40,22 +40,33 @@ struct btrfs_tree_parent_check { u8 level; }; -/* - * Comprehensive leaf checker. - * Will check not only the item pointers, but also every possible member - * in item data. - */ -int btrfs_check_leaf_full(struct extent_buffer *leaf); +enum btrfs_tree_block_status { + BTRFS_TREE_BLOCK_CLEAN, + BTRFS_TREE_BLOCK_INVALID_NRITEMS, + BTRFS_TREE_BLOCK_INVALID_PARENT_KEY, + BTRFS_TREE_BLOCK_BAD_KEY_ORDER, + BTRFS_TREE_BLOCK_INVALID_LEVEL, + BTRFS_TREE_BLOCK_INVALID_FREE_SPACE, + BTRFS_TREE_BLOCK_INVALID_OFFSETS, + BTRFS_TREE_BLOCK_INVALID_BLOCKPTR, + BTRFS_TREE_BLOCK_INVALID_ITEM, + BTRFS_TREE_BLOCK_INVALID_OWNER, +}; /* - * Less strict leaf checker. - * Will only check item pointers, not reading item data. + * Exported simply for btrfs-progs which wants to have the + * btrfs_tree_block_status return codes. */ -int btrfs_check_leaf_relaxed(struct extent_buffer *leaf); +enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf); +enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node); + +int btrfs_check_leaf(struct extent_buffer *leaf); int btrfs_check_node(struct extent_buffer *node); int btrfs_check_chunk_valid(struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 logical); int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner); +int btrfs_verify_level_key(struct extent_buffer *eb, int level, + struct btrfs_key *first_key, u64 parent_transid); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d2755d5e338b..365a1cc0a3c3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -859,10 +859,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); csum_root = btrfs_csum_root(fs_info, - sums->bytenr); + sums->logical); if (!ret) ret = btrfs_del_csums(trans, csum_root, - sums->bytenr, + sums->logical, sums->len); if (!ret) ret = btrfs_csum_file_blocks(trans, @@ -3252,7 +3252,7 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, * Returns 1 if the inode was logged before in the transaction, 0 if it was not, * and < 0 on error. */ -static int inode_logged(struct btrfs_trans_handle *trans, +static int inode_logged(const struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path_in) { @@ -4056,14 +4056,14 @@ static int drop_inode_items(struct btrfs_trans_handle *trans, while (1) { ret = btrfs_search_slot(trans, log, &key, path, -1, 1); - BUG_ON(ret == 0); /* Logic error */ - if (ret < 0) - break; - - if (path->slots[0] == 0) + if (ret < 0) { break; + } else if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } - path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); @@ -4221,7 +4221,7 @@ static int log_csums(struct btrfs_trans_handle *trans, struct btrfs_root *log_root, struct btrfs_ordered_sum *sums) { - const u64 lock_end = sums->bytenr + sums->len - 1; + const u64 lock_end = sums->logical + sums->len - 1; struct extent_state *cached_state = NULL; int ret; @@ -4239,7 +4239,7 @@ static int log_csums(struct btrfs_trans_handle *trans, * file which happens to refer to the same extent as well. Such races * can leave checksum items in the log with overlapping ranges. */ - ret = lock_extent(&log_root->log_csum_range, sums->bytenr, lock_end, + ret = lock_extent(&log_root->log_csum_range, sums->logical, lock_end, &cached_state); if (ret) return ret; @@ -4252,11 +4252,11 @@ static int log_csums(struct btrfs_trans_handle *trans, * some checksums missing in the fs/subvolume tree. So just delete (or * trim and adjust) any existing csum items in the log for this range. */ - ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); + ret = btrfs_del_csums(trans, log_root, sums->logical, sums->len); if (!ret) ret = btrfs_csum_file_blocks(trans, log_root, sums); - unlock_extent(&log_root->log_csum_range, sums->bytenr, lock_end, + unlock_extent(&log_root->log_csum_range, sums->logical, lock_end, &cached_state); return ret; @@ -5303,7 +5303,7 @@ out: * multiple times when multiple tasks have joined the same log transaction. */ static bool need_log_inode(const struct btrfs_trans_handle *trans, - const struct btrfs_inode *inode) + struct btrfs_inode *inode) { /* * If a directory was not modified, no dentries added or removed, we can @@ -5321,7 +5321,7 @@ static bool need_log_inode(const struct btrfs_trans_handle *trans, * logged_trans will be 0, in which case we have to fully log it since * logged_trans is a transient field, not persisted. */ - if (inode->logged_trans == trans->transid && + if (inode_logged(trans, inode, NULL) == 1 && !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) return false; @@ -7309,7 +7309,7 @@ error: */ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - int for_rename) + bool for_rename) { /* * when we're logging a file, if it hasn't been renamed @@ -7325,18 +7325,25 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, inode->last_unlink_trans = trans->transid; mutex_unlock(&inode->log_mutex); + if (!for_rename) + return; + /* - * if this directory was already logged any new - * names for this file/dir will get recorded + * If this directory was already logged, any new names will be logged + * with btrfs_log_new_name() and old names will be deleted from the log + * tree with btrfs_del_dir_entries_in_log() or with + * btrfs_del_inode_ref_in_log(). */ - if (dir->logged_trans == trans->transid) + if (inode_logged(trans, dir, NULL) == 1) return; /* - * if the inode we're about to unlink was logged, - * the log will be properly updated for any new names + * If the inode we're about to unlink was logged before, the log will be + * properly updated with the new name with btrfs_log_new_name() and the + * old name removed with btrfs_del_dir_entries_in_log() or with + * btrfs_del_inode_ref_in_log(). */ - if (inode->logged_trans == trans->transid) + if (inode_logged(trans, inode, NULL) == 1) return; /* @@ -7346,13 +7353,6 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, * properly. So, we have to be conservative and force commits * so the new name gets discovered. */ - if (for_rename) - goto record; - - /* we can safely do the unlink without any special recording */ - return; - -record: mutex_lock(&dir->log_mutex); dir->last_unlink_trans = trans->transid; mutex_unlock(&dir->log_mutex); diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index bdeb5216718f..a550a8a375cd 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -100,7 +100,7 @@ void btrfs_end_log_trans(struct btrfs_root *root); void btrfs_pin_log_trans(struct btrfs_root *root); void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - int for_rename); + bool for_rename); void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, struct btrfs_inode *dir); void btrfs_log_new_name(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index a555baa0143a..3df6153d5d5a 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -226,21 +226,32 @@ int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op) { struct tree_mod_elem *tm; - int ret; + int ret = 0; if (!tree_mod_need_log(eb->fs_info, eb)) return 0; tm = alloc_tree_mod_elem(eb, slot, op); if (!tm) - return -ENOMEM; + ret = -ENOMEM; if (tree_mod_dont_log(eb->fs_info, eb)) { kfree(tm); + /* + * Don't error if we failed to allocate memory because we don't + * need to log. + */ return 0; + } else if (ret != 0) { + /* + * We previously failed to allocate memory and we need to log, + * so we have to fail. + */ + goto out_unlock; } ret = tree_mod_log_insert(eb->fs_info, tm); +out_unlock: write_unlock(&eb->fs_info->tree_mod_log_lock); if (ret) kfree(tm); @@ -248,6 +259,26 @@ int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, return ret; } +static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb, + int dst_slot, int src_slot, + int nr_items) +{ + struct tree_mod_elem *tm; + + tm = kzalloc(sizeof(*tm), GFP_NOFS); + if (!tm) + return ERR_PTR(-ENOMEM); + + tm->logical = eb->start; + tm->slot = src_slot; + tm->move.dst_slot = dst_slot; + tm->move.nr_items = nr_items; + tm->op = BTRFS_MOD_LOG_MOVE_KEYS; + RB_CLEAR_NODE(&tm->node); + + return tm; +} + int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items) @@ -262,35 +293,46 @@ int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, return 0; tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS); - if (!tm_list) - return -ENOMEM; - - tm = kzalloc(sizeof(*tm), GFP_NOFS); - if (!tm) { + if (!tm_list) { ret = -ENOMEM; - goto free_tms; + goto lock; } - tm->logical = eb->start; - tm->slot = src_slot; - tm->move.dst_slot = dst_slot; - tm->move.nr_items = nr_items; - tm->op = BTRFS_MOD_LOG_MOVE_KEYS; + tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items); + if (IS_ERR(tm)) { + ret = PTR_ERR(tm); + tm = NULL; + goto lock; + } for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING); if (!tm_list[i]) { ret = -ENOMEM; - goto free_tms; + goto lock; } } - if (tree_mod_dont_log(eb->fs_info, eb)) +lock: + if (tree_mod_dont_log(eb->fs_info, eb)) { + /* + * Don't error if we failed to allocate memory because we don't + * need to log. + */ + ret = 0; goto free_tms; + } locked = true; /* + * We previously failed to allocate memory and we need to log, so we + * have to fail. + */ + if (ret != 0) + goto free_tms; + + /* * When we override something during the move, we log these removals. * This can only happen when we move towards the beginning of the * buffer, i.e. dst_slot < src_slot. @@ -310,10 +352,12 @@ int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, return 0; free_tms: - for (i = 0; i < nr_items; i++) { - if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) - rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log); - kfree(tm_list[i]); + if (tm_list) { + for (i = 0; i < nr_items; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log); + kfree(tm_list[i]); + } } if (locked) write_unlock(&eb->fs_info->tree_mod_log_lock); @@ -363,14 +407,14 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, GFP_NOFS); if (!tm_list) { ret = -ENOMEM; - goto free_tms; + goto lock; } for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(old_root, i, BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING); if (!tm_list[i]) { ret = -ENOMEM; - goto free_tms; + goto lock; } } } @@ -378,7 +422,7 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, tm = kzalloc(sizeof(*tm), GFP_NOFS); if (!tm) { ret = -ENOMEM; - goto free_tms; + goto lock; } tm->logical = new_root->start; @@ -387,14 +431,28 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, tm->generation = btrfs_header_generation(old_root); tm->op = BTRFS_MOD_LOG_ROOT_REPLACE; - if (tree_mod_dont_log(fs_info, NULL)) +lock: + if (tree_mod_dont_log(fs_info, NULL)) { + /* + * Don't error if we failed to allocate memory because we don't + * need to log. + */ + ret = 0; goto free_tms; + } else if (ret != 0) { + /* + * We previously failed to allocate memory and we need to log, + * so we have to fail. + */ + goto out_unlock; + } if (tm_list) ret = tree_mod_log_free_eb(fs_info, tm_list, nritems); if (!ret) ret = tree_mod_log_insert(fs_info, tm); +out_unlock: write_unlock(&fs_info->tree_mod_log_lock); if (ret) goto free_tms; @@ -486,9 +544,14 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, struct btrfs_fs_info *fs_info = dst->fs_info; int ret = 0; struct tree_mod_elem **tm_list = NULL; - struct tree_mod_elem **tm_list_add, **tm_list_rem; + struct tree_mod_elem **tm_list_add = NULL; + struct tree_mod_elem **tm_list_rem = NULL; int i; bool locked = false; + struct tree_mod_elem *dst_move_tm = NULL; + struct tree_mod_elem *src_move_tm = NULL; + u32 dst_move_nr_items = btrfs_header_nritems(dst) - dst_offset; + u32 src_move_nr_items = btrfs_header_nritems(src) - (src_offset + nr_items); if (!tree_mod_need_log(fs_info, NULL)) return 0; @@ -498,8 +561,30 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *), GFP_NOFS); - if (!tm_list) - return -ENOMEM; + if (!tm_list) { + ret = -ENOMEM; + goto lock; + } + + if (dst_move_nr_items) { + dst_move_tm = tree_mod_log_alloc_move(dst, dst_offset + nr_items, + dst_offset, dst_move_nr_items); + if (IS_ERR(dst_move_tm)) { + ret = PTR_ERR(dst_move_tm); + dst_move_tm = NULL; + goto lock; + } + } + if (src_move_nr_items) { + src_move_tm = tree_mod_log_alloc_move(src, src_offset, + src_offset + nr_items, + src_move_nr_items); + if (IS_ERR(src_move_tm)) { + ret = PTR_ERR(src_move_tm); + src_move_tm = NULL; + goto lock; + } + } tm_list_add = tm_list; tm_list_rem = tm_list + nr_items; @@ -508,21 +593,40 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, BTRFS_MOD_LOG_KEY_REMOVE); if (!tm_list_rem[i]) { ret = -ENOMEM; - goto free_tms; + goto lock; } tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, BTRFS_MOD_LOG_KEY_ADD); if (!tm_list_add[i]) { ret = -ENOMEM; - goto free_tms; + goto lock; } } - if (tree_mod_dont_log(fs_info, NULL)) +lock: + if (tree_mod_dont_log(fs_info, NULL)) { + /* + * Don't error if we failed to allocate memory because we don't + * need to log. + */ + ret = 0; goto free_tms; + } locked = true; + /* + * We previously failed to allocate memory and we need to log, so we + * have to fail. + */ + if (ret != 0) + goto free_tms; + + if (dst_move_tm) { + ret = tree_mod_log_insert(fs_info, dst_move_tm); + if (ret) + goto free_tms; + } for (i = 0; i < nr_items; i++) { ret = tree_mod_log_insert(fs_info, tm_list_rem[i]); if (ret) @@ -531,6 +635,11 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, if (ret) goto free_tms; } + if (src_move_tm) { + ret = tree_mod_log_insert(fs_info, src_move_tm); + if (ret) + goto free_tms; + } write_unlock(&fs_info->tree_mod_log_lock); kfree(tm_list); @@ -538,10 +647,18 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, return 0; free_tms: - for (i = 0; i < nr_items * 2; i++) { - if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) - rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); - kfree(tm_list[i]); + if (dst_move_tm && !RB_EMPTY_NODE(&dst_move_tm->node)) + rb_erase(&dst_move_tm->node, &fs_info->tree_mod_log); + kfree(dst_move_tm); + if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node)) + rb_erase(&src_move_tm->node, &fs_info->tree_mod_log); + kfree(src_move_tm); + if (tm_list) { + for (i = 0; i < nr_items * 2; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); + } } if (locked) write_unlock(&fs_info->tree_mod_log_lock); @@ -562,22 +679,38 @@ int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb) nritems = btrfs_header_nritems(eb); tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS); - if (!tm_list) - return -ENOMEM; + if (!tm_list) { + ret = -ENOMEM; + goto lock; + } for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i, BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING); if (!tm_list[i]) { ret = -ENOMEM; - goto free_tms; + goto lock; } } - if (tree_mod_dont_log(eb->fs_info, eb)) +lock: + if (tree_mod_dont_log(eb->fs_info, eb)) { + /* + * Don't error if we failed to allocate memory because we don't + * need to log. + */ + ret = 0; goto free_tms; + } else if (ret != 0) { + /* + * We previously failed to allocate memory and we need to log, + * so we have to fail. + */ + goto out_unlock; + } ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems); +out_unlock: write_unlock(&eb->fs_info->tree_mod_log_lock); if (ret) goto free_tms; @@ -586,9 +719,11 @@ int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb) return 0; free_tms: - for (i = 0; i < nritems; i++) - kfree(tm_list[i]); - kfree(tm_list); + if (tm_list) { + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + } return ret; } @@ -664,10 +799,27 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, unsigned long o_dst; unsigned long o_src; unsigned long p_size = sizeof(struct btrfs_key_ptr); + /* + * max_slot tracks the maximum valid slot of the rewind eb at every + * step of the rewind. This is in contrast with 'n' which eventually + * matches the number of items, but can be wrong during moves or if + * removes overlap on already valid slots (which is probably separately + * a bug). We do this to validate the offsets of memmoves for rewinding + * moves and detect invalid memmoves. + * + * Since a rewind eb can start empty, max_slot is a signed integer with + * a special meaning for -1, which is that no slot is valid to move out + * of. Any other negative value is invalid. + */ + int max_slot; + int move_src_end_slot; + int move_dst_end_slot; n = btrfs_header_nritems(eb); + max_slot = n - 1; read_lock(&fs_info->tree_mod_log_lock); while (tm && tm->seq >= time_seq) { + ASSERT(max_slot >= -1); /* * All the operations are recorded with the operator used for * the modification. As we're going backwards, we do the @@ -684,6 +836,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, btrfs_set_node_ptr_generation(eb, tm->slot, tm->generation); n++; + if (tm->slot > max_slot) + max_slot = tm->slot; break; case BTRFS_MOD_LOG_KEY_REPLACE: BUG_ON(tm->slot >= n); @@ -693,14 +847,37 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, tm->generation); break; case BTRFS_MOD_LOG_KEY_ADD: + /* + * It is possible we could have already removed keys + * behind the known max slot, so this will be an + * overestimate. In practice, the copy operation + * inserts them in increasing order, and overestimating + * just means we miss some warnings, so it's OK. It + * isn't worth carefully tracking the full array of + * valid slots to check against when moving. + */ + if (tm->slot == max_slot) + max_slot--; /* if a move operation is needed it's in the log */ n--; break; case BTRFS_MOD_LOG_MOVE_KEYS: + ASSERT(tm->move.nr_items > 0); + move_src_end_slot = tm->move.dst_slot + tm->move.nr_items - 1; + move_dst_end_slot = tm->slot + tm->move.nr_items - 1; o_dst = btrfs_node_key_ptr_offset(eb, tm->slot); o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot); + if (WARN_ON(move_src_end_slot > max_slot || + tm->move.nr_items <= 0)) { + btrfs_warn(fs_info, +"move from invalid tree mod log slot eb %llu slot %d dst_slot %d nr_items %d seq %llu n %u max_slot %d", + eb->start, tm->slot, + tm->move.dst_slot, tm->move.nr_items, + tm->seq, n, max_slot); + } memmove_extent_buffer(eb, o_dst, o_src, tm->move.nr_items * p_size); + max_slot = move_dst_end_slot; break; case BTRFS_MOD_LOG_ROOT_REPLACE: /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 72a838c97534..4193ace3fb5a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -370,6 +370,8 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, { struct btrfs_fs_devices *fs_devs; + ASSERT(fsid || !metadata_fsid); + fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); if (!fs_devs) return ERR_PTR(-ENOMEM); @@ -380,18 +382,17 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, INIT_LIST_HEAD(&fs_devs->alloc_list); INIT_LIST_HEAD(&fs_devs->fs_list); INIT_LIST_HEAD(&fs_devs->seed_list); - if (fsid) - memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); - if (metadata_fsid) - memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE); - else if (fsid) - memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); + if (fsid) { + memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); + memcpy(fs_devs->metadata_uuid, + metadata_fsid ?: fsid, BTRFS_FSID_SIZE); + } return fs_devs; } -void btrfs_free_device(struct btrfs_device *device) +static void btrfs_free_device(struct btrfs_device *device) { WARN_ON(!list_empty(&device->post_commit_list)); rcu_string_free(device->name); @@ -426,6 +427,21 @@ void __exit btrfs_cleanup_fs_uuids(void) } } +static bool match_fsid_fs_devices(const struct btrfs_fs_devices *fs_devices, + const u8 *fsid, const u8 *metadata_fsid) +{ + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) != 0) + return false; + + if (!metadata_fsid) + return true; + + if (memcmp(metadata_fsid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0) + return false; + + return true; +} + static noinline struct btrfs_fs_devices *find_fsid( const u8 *fsid, const u8 *metadata_fsid) { @@ -435,19 +451,25 @@ static noinline struct btrfs_fs_devices *find_fsid( /* Handle non-split brain cases */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (metadata_fsid) { - if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0 - && memcmp(metadata_fsid, fs_devices->metadata_uuid, - BTRFS_FSID_SIZE) == 0) - return fs_devices; - } else { - if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) - return fs_devices; - } + if (match_fsid_fs_devices(fs_devices, fsid, metadata_fsid)) + return fs_devices; } return NULL; } +/* + * First check if the metadata_uuid is different from the fsid in the given + * fs_devices. Then check if the given fsid is the same as the metadata_uuid + * in the fs_devices. If it is, return true; otherwise, return false. + */ +static inline bool check_fsid_changed(const struct btrfs_fs_devices *fs_devices, + const u8 *fsid) +{ + return memcmp(fs_devices->fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) != 0 && + memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE) == 0; +} + static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( struct btrfs_super_block *disk_super) { @@ -461,14 +483,14 @@ static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( * at all and the CHANGING_FSID_V2 flag set. */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (fs_devices->fsid_change && - memcmp(disk_super->metadata_uuid, fs_devices->fsid, - BTRFS_FSID_SIZE) == 0 && - memcmp(fs_devices->fsid, fs_devices->metadata_uuid, - BTRFS_FSID_SIZE) == 0) { + if (!fs_devices->fsid_change) + continue; + + if (match_fsid_fs_devices(fs_devices, disk_super->metadata_uuid, + fs_devices->fsid)) return fs_devices; - } } + /* * Handle scanned device having completed its fsid change but * belonging to a fs_devices that was created by a device that @@ -476,13 +498,11 @@ static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( * CHANGING_FSID_V2 flag set. */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (fs_devices->fsid_change && - memcmp(fs_devices->metadata_uuid, - fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && - memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid, - BTRFS_FSID_SIZE) == 0) { + if (!fs_devices->fsid_change) + continue; + + if (check_fsid_changed(fs_devices, disk_super->metadata_uuid)) return fs_devices; - } } return find_fsid(disk_super->fsid, disk_super->metadata_uuid); @@ -673,18 +693,16 @@ static struct btrfs_fs_devices *find_fsid_inprogress( struct btrfs_fs_devices *fs_devices; list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, - BTRFS_FSID_SIZE) != 0 && - memcmp(fs_devices->metadata_uuid, disk_super->fsid, - BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) { + if (fs_devices->fsid_change) + continue; + + if (check_fsid_changed(fs_devices, disk_super->fsid)) return fs_devices; - } } return find_fsid(disk_super->fsid, NULL); } - static struct btrfs_fs_devices *find_fsid_changed( struct btrfs_super_block *disk_super) { @@ -701,10 +719,7 @@ static struct btrfs_fs_devices *find_fsid_changed( */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { /* Changed UUIDs */ - if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, - BTRFS_FSID_SIZE) != 0 && - memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid, - BTRFS_FSID_SIZE) == 0 && + if (check_fsid_changed(fs_devices, disk_super->metadata_uuid) && memcmp(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE) != 0) return fs_devices; @@ -735,11 +750,10 @@ static struct btrfs_fs_devices *find_fsid_reverted_metadata( * fs_devices equal to the FSID of the disk. */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid, - BTRFS_FSID_SIZE) != 0 && - memcmp(fs_devices->metadata_uuid, disk_super->fsid, - BTRFS_FSID_SIZE) == 0 && - fs_devices->fsid_change) + if (!fs_devices->fsid_change) + continue; + + if (check_fsid_changed(fs_devices, disk_super->fsid)) return fs_devices; } @@ -790,12 +804,8 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (!fs_devices) { - if (has_metadata_uuid) - fs_devices = alloc_fs_devices(disk_super->fsid, - disk_super->metadata_uuid); - else - fs_devices = alloc_fs_devices(disk_super->fsid, NULL); - + fs_devices = alloc_fs_devices(disk_super->fsid, + has_metadata_uuid ? disk_super->metadata_uuid : NULL); if (IS_ERR(fs_devices)) return ERR_CAST(fs_devices); @@ -1918,7 +1928,7 @@ static void update_dev_time(const char *device_path) return; now = current_time(d_inode(path.dentry)); - inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME); + inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME | S_VERSION); path_put(&path); } @@ -6163,17 +6173,10 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, bioc->replace_nr_stripes = nr_extra_stripes; } -static bool need_full_stripe(enum btrfs_map_op op) -{ - return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); -} - static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, u64 offset, u32 *stripe_nr, u64 *stripe_offset, u64 *full_stripe_start) { - ASSERT(op != BTRFS_MAP_DISCARD); - /* * Stripe_nr is the stripe where this block falls. stripe_offset is * the offset of this block in its stripe. @@ -6226,11 +6229,11 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup * stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); } -int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret, - struct btrfs_io_stripe *smap, int *mirror_num_ret, - int need_raid_map) +int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 *length, + struct btrfs_io_context **bioc_ret, + struct btrfs_io_stripe *smap, int *mirror_num_ret, + int need_raid_map) { struct extent_map *em; struct map_lookup *map; @@ -6253,7 +6256,6 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 max_len; ASSERT(bioc_ret); - ASSERT(op != BTRFS_MAP_DISCARD); num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize); if (mirror_num > num_copies) @@ -6285,21 +6287,21 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, if (map->type & BTRFS_BLOCK_GROUP_RAID0) { stripe_index = stripe_nr % map->num_stripes; stripe_nr /= map->num_stripes; - if (!need_full_stripe(op)) + if (op == BTRFS_MAP_READ) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { - if (need_full_stripe(op)) + if (op != BTRFS_MAP_READ) { num_stripes = map->num_stripes; - else if (mirror_num) + } else if (mirror_num) { stripe_index = mirror_num - 1; - else { + } else { stripe_index = find_live_mirror(fs_info, map, 0, dev_replace_is_ongoing); mirror_num = stripe_index + 1; } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (need_full_stripe(op)) { + if (op != BTRFS_MAP_READ) { num_stripes = map->num_stripes; } else if (mirror_num) { stripe_index = mirror_num - 1; @@ -6313,7 +6315,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, stripe_index = (stripe_nr % factor) * map->sub_stripes; stripe_nr /= factor; - if (need_full_stripe(op)) + if (op != BTRFS_MAP_READ) num_stripes = map->sub_stripes; else if (mirror_num) stripe_index += mirror_num - 1; @@ -6326,7 +6328,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { + if (need_raid_map && (op != BTRFS_MAP_READ || mirror_num > 1)) { /* * Push stripe_nr back to the start of the full stripe * For those cases needing a full stripe, @stripe_nr @@ -6362,7 +6364,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, /* We distribute the parity blocks across stripes */ stripe_index = (stripe_nr + stripe_index) % map->num_stripes; - if (!need_full_stripe(op) && mirror_num <= 1) + if (op == BTRFS_MAP_READ && mirror_num <= 1) mirror_num = 1; } } else { @@ -6402,7 +6404,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, */ if (smap && num_alloc_stripes == 1 && !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) && - (!need_full_stripe(op) || !dev_replace_is_ongoing || + (op == BTRFS_MAP_READ || !dev_replace_is_ongoing || !dev_replace->tgtdev)) { set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr); *mirror_num_ret = mirror_num; @@ -6426,7 +6428,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * It's still mostly the same as other profiles, just with extra rotation. */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && - (need_full_stripe(op) || mirror_num > 1)) { + (op != BTRFS_MAP_READ || mirror_num > 1)) { /* * For RAID56 @stripe_nr is already the number of full stripes * before us, which is also the rotation value (needs to modulo @@ -6453,11 +6455,11 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } } - if (need_full_stripe(op)) + if (op != BTRFS_MAP_READ) max_errors = btrfs_chunk_max_errors(map); if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && - need_full_stripe(op)) { + op != BTRFS_MAP_READ) { handle_ops_on_dev_replace(op, bioc, dev_replace, logical, &num_stripes, &max_errors); } @@ -6477,23 +6479,6 @@ out: return ret; } -int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret, int mirror_num) -{ - return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, - NULL, &mirror_num, 0); -} - -/* For Scrub/replace */ -int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret) -{ - return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, - NULL, NULL, 1); -} - static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, const struct btrfs_fs_devices *fs_devices) { @@ -8070,8 +8055,8 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, ASSERT(mirror_num > 0); - ret = __btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, - &bioc, smap, &mirror_ret, true); + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, + &bioc, smap, &mirror_ret, true); if (ret < 0) return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 64066d48dce1..d44cb9d22d63 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -280,8 +280,19 @@ enum btrfs_read_policy { struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + + /* + * UUID written into the btree blocks: + * + * - If metadata_uuid != fsid then super block must have + * BTRFS_FEATURE_INCOMPAT_METADATA_UUID flag set. + * + * - Following shall be true at all times: + * - metadata_uuid == btrfs_header::fsid + * - metadata_uuid == btrfs_dev_item::fsid + */ u8 metadata_uuid[BTRFS_FSID_SIZE]; - bool fsid_change; + struct list_head fs_list; /* @@ -319,34 +330,32 @@ struct btrfs_fs_devices { */ struct btrfs_device *latest_dev; - /* all of the devices in the FS, protected by a mutex - * so we can safely walk it to write out the supers without - * worrying about add/remove by the multi-device code. - * Scrubbing super can kick off supers writing by holding - * this mutex lock. + /* + * All of the devices in the filesystem, protected by a mutex so we can + * safely walk it to write out the super blocks without worrying about + * adding/removing by the multi-device code. Scrubbing super block can + * kick off supers writing by holding this mutex lock. */ struct mutex device_list_mutex; /* List of all devices, protected by device_list_mutex */ struct list_head devices; - /* - * Devices which can satisfy space allocation. Protected by - * chunk_mutex - */ + /* Devices which can satisfy space allocation. Protected by * chunk_mutex. */ struct list_head alloc_list; struct list_head seed_list; - bool seeding; + /* Count fs-devices opened. */ int opened; - /* set when we find or add a device that doesn't have the - * nonrot flag set - */ + /* Set when we find or add a device that doesn't have the nonrot flag set. */ bool rotating; - /* Devices support TRIM/discard commands */ + /* Devices support TRIM/discard commands. */ bool discardable; + bool fsid_change; + /* The filesystem is a seed filesystem. */ + bool seeding; struct btrfs_fs_info *fs_info; /* sysfs kobjects */ @@ -357,7 +366,7 @@ struct btrfs_fs_devices { enum btrfs_chunk_allocation_policy chunk_alloc_policy; - /* Policy used to read the mirrored stripes */ + /* Policy used to read the mirrored stripes. */ enum btrfs_read_policy read_policy; }; @@ -547,15 +556,12 @@ struct btrfs_dev_lookup_args { enum btrfs_map_op { BTRFS_MAP_READ, BTRFS_MAP_WRITE, - BTRFS_MAP_DISCARD, BTRFS_MAP_GET_READ_MIRRORS, }; static inline enum btrfs_map_op btrfs_op(struct bio *bio) { switch (bio_op(bio)) { - case REQ_OP_DISCARD: - return BTRFS_MAP_DISCARD; case REQ_OP_WRITE: case REQ_OP_ZONE_APPEND: return BTRFS_MAP_WRITE; @@ -589,15 +595,9 @@ void btrfs_get_bioc(struct btrfs_io_context *bioc); void btrfs_put_bioc(struct btrfs_io_context *bioc); int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret, int mirror_num); -int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret); -int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret, - struct btrfs_io_stripe *smap, int *mirror_num_ret, - int need_raid_map); + struct btrfs_io_context **bioc_ret, + struct btrfs_io_stripe *smap, int *mirror_num_ret, + int need_raid_map); int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, struct btrfs_io_stripe *smap, u64 logical, u32 length, int mirror_num); @@ -628,7 +628,6 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, const u8 *uuid, const char *path); void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args); -void btrfs_free_device(struct btrfs_device *device); int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, struct block_device **bdev, fmode_t *mode); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 8acb05e176c5..6c231a116a29 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -63,7 +63,7 @@ struct list_head *zlib_alloc_workspace(unsigned int level) workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), zlib_inflate_workspacesize()); - workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL); + workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL | __GFP_NOWARN); workspace->level = level; workspace->buf = NULL; /* diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 39828af4a4e8..85b8b332add9 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -15,6 +15,7 @@ #include "transaction.h" #include "dev-replace.h" #include "space-info.h" +#include "super.h" #include "fs.h" #include "accessors.h" #include "bio.h" @@ -1057,7 +1058,7 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, /* Check if zones in the region are all empty */ if (btrfs_dev_is_sequential(device, pos) && - find_next_zero_bit(zinfo->empty_zones, end, begin) != end) { + !bitmap_test_range_all_set(zinfo->empty_zones, begin, nzones)) { pos += zinfo->zone_size; continue; } @@ -1156,23 +1157,23 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) struct btrfs_zoned_device_info *zinfo = device->zone_info; const u8 shift = zinfo->zone_size_shift; unsigned long begin = start >> shift; - unsigned long end = (start + size) >> shift; + unsigned long nbits = size >> shift; u64 pos; int ret; ASSERT(IS_ALIGNED(start, zinfo->zone_size)); ASSERT(IS_ALIGNED(size, zinfo->zone_size)); - if (end > zinfo->nr_zones) + if (begin + nbits > zinfo->nr_zones) return -ERANGE; /* All the zones are conventional */ - if (find_next_bit(zinfo->seq_zones, end, begin) == end) + if (bitmap_test_range_all_zero(zinfo->seq_zones, begin, nbits)) return 0; /* All the zones are sequential and empty */ - if (find_next_zero_bit(zinfo->seq_zones, end, begin) == end && - find_next_zero_bit(zinfo->empty_zones, end, begin) == end) + if (bitmap_test_range_all_set(zinfo->seq_zones, begin, nbits) && + bitmap_test_range_all_set(zinfo->empty_zones, begin, nbits)) return 0; for (pos = start; pos < start + size; pos += zinfo->zone_size) { @@ -1602,37 +1603,17 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb) { - struct btrfs_fs_info *fs_info = eb->fs_info; - - if (!btrfs_is_zoned(fs_info) || - btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) || - !list_empty(&eb->release_list)) + if (!btrfs_is_zoned(eb->fs_info) || + btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN)) return; + ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + memzero_extent_buffer(eb, 0, eb->len); set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); set_extent_buffer_dirty(eb); - set_extent_bits_nowait(&trans->dirty_pages, eb->start, - eb->start + eb->len - 1, EXTENT_DIRTY); - - spin_lock(&trans->releasing_ebs_lock); - list_add_tail(&eb->release_list, &trans->releasing_ebs); - spin_unlock(&trans->releasing_ebs_lock); - atomic_inc(&eb->refs); -} - -void btrfs_free_redirty_list(struct btrfs_transaction *trans) -{ - spin_lock(&trans->releasing_ebs_lock); - while (!list_empty(&trans->releasing_ebs)) { - struct extent_buffer *eb; - - eb = list_first_entry(&trans->releasing_ebs, - struct extent_buffer, release_list); - list_del_init(&eb->release_list); - free_extent_buffer(eb); - } - spin_unlock(&trans->releasing_ebs_lock); + set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1, + EXTENT_DIRTY | EXTENT_NOWAIT, NULL); } bool btrfs_use_zone_append(struct btrfs_bio *bbio) @@ -1677,63 +1658,89 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio) void btrfs_record_physical_zoned(struct btrfs_bio *bbio) { const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; - struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_sum *sum = bbio->sums; - ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset); - if (WARN_ON(!ordered)) - return; - - ordered->physical = physical; - btrfs_put_ordered_extent(ordered); + if (physical < bbio->orig_physical) + sum->logical -= bbio->orig_physical - physical; + else + sum->logical += physical - bbio->orig_physical; } -void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) +static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered, + u64 logical) { - struct btrfs_inode *inode = BTRFS_I(ordered->inode); - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_map_tree *em_tree; + struct extent_map_tree *em_tree = &BTRFS_I(ordered->inode)->extent_tree; struct extent_map *em; - struct btrfs_ordered_sum *sum; - u64 orig_logical = ordered->disk_bytenr; - struct map_lookup *map; - u64 physical = ordered->physical; - u64 chunk_start_phys; - u64 logical; - - em = btrfs_get_chunk_map(fs_info, orig_logical, 1); - if (IS_ERR(em)) - return; - map = em->map_lookup; - chunk_start_phys = map->stripes[0].physical; - - if (WARN_ON_ONCE(map->num_stripes > 1) || - WARN_ON_ONCE((map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) || - WARN_ON_ONCE(physical < chunk_start_phys) || - WARN_ON_ONCE(physical > chunk_start_phys + em->orig_block_len)) { - free_extent_map(em); - return; - } - logical = em->start + (physical - map->stripes[0].physical); - free_extent_map(em); - - if (orig_logical == logical) - return; ordered->disk_bytenr = logical; - em_tree = &inode->extent_tree; write_lock(&em_tree->lock); em = search_extent_mapping(em_tree, ordered->file_offset, ordered->num_bytes); em->block_start = logical; free_extent_map(em); write_unlock(&em_tree->lock); +} - list_for_each_entry(sum, &ordered->list, list) { - if (logical < orig_logical) - sum->bytenr -= orig_logical - logical; - else - sum->bytenr += logical - orig_logical; +static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered, + u64 logical, u64 len) +{ + struct btrfs_ordered_extent *new; + + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && + split_extent_map(BTRFS_I(ordered->inode), ordered->file_offset, + ordered->num_bytes, len, logical)) + return false; + + new = btrfs_split_ordered_extent(ordered, len); + if (IS_ERR(new)) + return false; + new->disk_bytenr = logical; + btrfs_finish_one_ordered(new); + return true; +} + +void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_ordered_sum *sum = + list_first_entry(&ordered->list, typeof(*sum), list); + u64 logical = sum->logical; + u64 len = sum->len; + + while (len < ordered->disk_num_bytes) { + sum = list_next_entry(sum, list); + if (sum->logical == logical + len) { + len += sum->len; + continue; + } + if (!btrfs_zoned_split_ordered(ordered, logical, len)) { + set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); + btrfs_err(fs_info, "failed to split ordered extent"); + goto out; + } + logical = sum->logical; + len = sum->len; + } + + if (ordered->disk_bytenr != logical) + btrfs_rewrite_logical_zoned(ordered, logical); + +out: + /* + * If we end up here for nodatasum I/O, the btrfs_ordered_sum structures + * were allocated by btrfs_alloc_dummy_sum only to record the logical + * addresses and don't contain actual checksums. We thus must free them + * here so that we don't attempt to log the csums later. + */ + if ((inode->flags & BTRFS_INODE_NODATASUM) || + test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) { + while ((sum = list_first_entry_or_null(&ordered->list, + typeof(*sum), list))) { + list_del(&sum->list); + kfree(sum); + } } } @@ -1792,8 +1799,8 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, int nmirrors; int i, ret; - ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &mapped_length, &bioc); + ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, + &mapped_length, &bioc, NULL, NULL, 1); if (ret || !bioc || mapped_length < PAGE_SIZE) { ret = -EIO; goto out_put_bioc; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index c0570d35fea2..27322b926038 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -30,6 +30,8 @@ struct btrfs_zoned_device_info { struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX]; }; +void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered); + #ifdef CONFIG_BLK_DEV_ZONED int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone); @@ -54,10 +56,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new); void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); -void btrfs_free_redirty_list(struct btrfs_transaction *trans); bool btrfs_use_zone_append(struct btrfs_bio *bbio); void btrfs_record_physical_zoned(struct btrfs_bio *bbio); -void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, struct btrfs_block_group **cache_ret); @@ -179,7 +179,6 @@ static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { } static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb) { } -static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { } static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio) { @@ -190,9 +189,6 @@ static inline void btrfs_record_physical_zoned(struct btrfs_bio *bbio) { } -static inline void btrfs_rewrite_logical_zoned( - struct btrfs_ordered_extent *ordered) { } - static inline bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, struct btrfs_block_group **cache_ret) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index f798da267590..e7ac4ec809a4 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -356,7 +356,7 @@ struct list_head *zstd_alloc_workspace(unsigned int level) workspace->level = level; workspace->req_level = level; workspace->last_used = jiffies; - workspace->mem = kvmalloc(workspace->size, GFP_KERNEL); + workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN); workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!workspace->mem || !workspace->buf) goto fail; |